R R.scripts
!!!myIndexNICER3
{{outline}}
!!ファイル名・スコア・言語指標を取り出す
*NICERのCHATフォーマットのデータ(サンプルとして50)の入っているフォルダー内で、
*各種言語指標とファイル名とエッセイのスコアを取り出して、「リスト」として出力する。
*(catでテキストファイルに出力したほうが簡単)
{{pre
# Compute lexical indices for every NICER (CHAT-format) file in a folder.
#
# Arguments:
#   path   : folder that holds the CHAT files (default: current directory).
#   window : MATTR window size in tokens (default: 100). Texts shorter than
#            the window use their own length as the window, so MATTR then
#            equals TTR.
#
# Value:
#   An unnamed list with one element per file. Each element is an unnamed
#   list of 10 values, in this order:
#     file name, Criterion score (character; NA when the header is missing),
#     Token, Type, NoS, TTR, GI (Type / sqrt(Token)), MATTR, AWL, ASL.
#   Ratios are NaN for a file without any main-tier text.
#
# Alternative output: instead of collecting a list, each row could be written
# with cat(i, Score, Token, ..., "\n", file = output.file, append = TRUE).
myIndexNICER3 <- function(path = ".", window = 100L) {
  stopifnot(is.numeric(window), length(window) == 1L, window >= 1)

  files <- list.files(path)
  # list.files() also returns sub-folder names, which scan() cannot read.
  files <- files[!dir.exists(file.path(path, files))]

  # Main tiers only: "*JPNxxx:<TAB>text" or "*NSxxx:<TAB>text"
  tier.pattern <- "\\*(JPN|NS)...:\t"

  lapply(files, function(i) {
    lines.tmp <- scan(file.path(path, i), what = "", sep = "\n", quiet = TRUE)

    # Criterion score: keep exactly one value per file so that every result
    # row has 10 fields even when the header is missing or repeated.
    criterion.tmp <- grep("@Criterion", lines.tmp, value = TRUE)
    Score <- if (length(criterion.tmp) > 0) {
      gsub("@Criterion:\t", "", criterion.tmp[1])
    } else {
      NA_character_
    }

    #----NICER: strip the speaker label, keep the sentence text
    data.tmp <- grep(tier.pattern, lines.tmp, value = TRUE)
    body.tmp <- gsub(tier.pattern, "", data.tmp)
    body.tmp <- body.tmp[body.tmp != ""]

    words.tmp <- as.character(unlist(strsplit(tolower(body.tmp), "\\W+")))
    # strsplit() returns "" when a line starts with a non-word character;
    # that is not a token.
    words.tmp <- words.tmp[nzchar(words.tmp)]

    Token <- length(words.tmp)
    Type <- length(unique(words.tmp))
    # Number of sentences = number of main-tier lines, not the number of
    # lines in the whole file (which includes headers and % tiers).
    NoS <- length(body.tmp)
    TTR <- Type / Token
    GI <- Type / sqrt(Token)
    AWL <- sum(nchar(words.tmp)) / Token
    ASL <- Token / NoS

    # MATTR: mean TTR over Token windows, one starting at every token and
    # wrapping around to the beginning of the text (circular windows).
    if (Token > 0) {
      w <- min(as.integer(window), Token)
      offsets <- seq_len(w) - 1L
      wttr <- vapply(seq_len(Token), function(j) {
        idx <- (j - 1L + offsets) %% Token + 1L
        length(unique(words.tmp[idx])) / w
      }, numeric(1))
      MATTR <- sum(wttr) / Token
    } else {
      MATTR <- NaN
    }

    list(i, Score, Token, Type, NoS, TTR, GI, MATTR, AWL, ASL)
  })
}
}}
*注意: 以下の出力例は、NoSをファイル全体の行数(ヘッダー等を含む)で数えていた旧版の関数によるもの。上の関数ではNoSは本文(*JPN/*NSの行)の数になるので、NoSとASLの値は以下とは異なる(空文字列をトークンから除くため、Token等もわずかに異なる場合がある)。
{{pre
> JP_result <- myIndexNICER3()
> str(JP_result)
List of 50
 $ :List of 10
  ..$ : chr "JPN501.txt"
  ..$ : chr "4"
  ..$ : int 319
  ..$ : int 135
  ..$ : int 123
  ..$ : num 0.423
  ..$ : num 7.56
  ..$ : num 0.592
  ..$ : num 4.3
  ..$ : num 2.59
 $ :List of 10
  ..$ : chr "JPN502.txt"
  ..$ : chr "4"
  ..$ : int 356
  ..$ : int 161
  ..$ : int 120
  ..$ : num 0.452
  ..$ : num 8.53
  ..$ : num 0.665
  ..$ : num 4.23
  ..$ : num 2.97
 $ :List of 10
  ..$ : chr "JPN503.txt"
  ..$ : chr "3"
  ..$ : int 201
  ..$ : int 121
(以下略)
}}
!!その後の処理
!unlistして、matrixとして一覧表にする。
 > matrix(unlist(JP_result), nrow=10, ncol=50)
!転置行列(縦横変換) t() (transpose)
{{pre
> t(matrix(unlist(JP_result), nrow=10, ncol=50))
     [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
[1,] "JPN501.txt" "4" "319" "135" "123" "0.423197492163009" "7.55854889790403" "0.592131661442006" "4.30407523510972" "2.59349593495935"
[2,] "JPN502.txt" "4" "356" "161" "120" "0.452247191011236" "8.5329829340512" "0.664915730337079" "4.23314606741573" "2.96666666666667"
[3,] "JPN503.txt" "3" "201" "121" "70" "0.601990049751244" "8.53468195188904" "0.717014925373134" "4.74626865671642" "2.87142857142857"
[4,] "JPN504.txt" "4" "260" "140" "114" "0.538461538461538" "8.68243142124459" "0.687769230769229" "4.76153846153846" "2.28070175438596"
[5,] "JPN505.txt" "4" "420" "175" "106" "0.416666666666667" "8.53912563829967" "0.634190476190476" "3.9952380952381" "3.9622641509434"
[6,] "JPN506.txt" "3" "261" "124" "93" "0.475095785440613" "7.67540731131814" "0.639003831417626" "4.0727969348659" "2.80645161290323"
}}
!データフレームに
{{pre
> as.data.frame(t(matrix(unlist(JP_result), nrow=10, ncol=50)))
  V1 V2 V3 V4 V5 V6 V7 V8 V9 V10
1 JPN501.txt 4 319 135 123 0.423197492163009 7.55854889790403 0.592131661442006 4.30407523510972 2.59349593495935
2 JPN502.txt 4 356 161 120 0.452247191011236 8.5329829340512 0.664915730337079 4.23314606741573 2.96666666666667
3 JPN503.txt 3 201 121 70 0.601990049751244 8.53468195188904 0.717014925373134 4.74626865671642 2.87142857142857
4 JPN504.txt 4 260 140 114 0.538461538461538 8.68243142124459 0.687769230769229 4.76153846153846 2.28070175438596
5 JPN505.txt 4 420 175 106 0.416666666666667 8.53912563829967 0.634190476190476 3.9952380952381 3.9622641509434
6 JPN506.txt 3 261 124 93 0.475095785440613 7.67540731131814 0.639003831417626 4.0727969348659 2.80645161290323
> JP50.df <- as.data.frame(t(matrix(unlist(JP_result), nrow=10, ncol=50)))
> str(JP50.df)
'data.frame': 50 obs. 
of 10 variables:
 $ V1 : Factor w/ 50 levels "JPN501.txt","JPN502.txt",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ V2 : Factor w/ 4 levels "2","3","4","5": 3 3 2 3 3 2 3 2 3 2 ...
 $ V3 : Factor w/ 45 levels "168","183","187",..: 25 31 8 16 39 17 32 7 18 2 ...
 $ V4 : Factor w/ 39 levels "100","103","104",..: 14 23 10 18 28 11 21 38 3 39 ...
 $ V5 : Factor w/ 35 levels "102","105","106",..: 14 13 22 9 3 30 6 30 28 23 ...
 $ V6 : Factor w/ 50 levels "0.353115727002967",..: 15 26 50 44 12 33 13 36 10 46 ...
 $ V7 : Factor w/ 50 levels "6.06974962128721",..: 23 35 36 43 37 26 29 9 4 16 ...
 $ V8 : Factor w/ 50 levels "0.539656862745098",..: 6 39 49 46 24 27 34 17 3 38 ...
 $ V9 : Factor w/ 49 levels "3.78571428571429",..: 23 21 44 45 9 12 22 34 10 27 ...
 $ V10: Factor w/ 50 levels "1.93103448275862",..: 22 32 29 12 48 27 37 7 31 17 ...
}}
!この後、データの型の変換が必要(数値の列もすべてFactorになっているので、as.numeric(as.character(x)) で数値に変換する。as.numeric(x) だけでは水準の番号になってしまう)