*disclaimer
1198388
myIndexNICER3
ファイル名・スコア・言語指標を取り出す
- NICERのCHATフォーマットのデータ(サンプルとして50ファイル)の入っているフォルダー内で、
- 各種言語指標とファイル名とエッセイのスコアを取り出して、「リスト」として出力する。
- (catでテキストファイルに出力したほうが簡単)
#' Extract file name, essay score and lexical indices from NICER files.
#'
#' Reads every file in `dir` (expected to be NICER essays in CHAT format),
#' pulls out the `@Criterion` score and the `*JPN...` / `*NS...` utterance
#' tiers, and computes a set of lexical indices for each file.
#'
#' @param dir Folder containing the CHAT files. Defaults to the current
#'   working directory, which is what the original zero-argument version used.
#' @param window Window size (in tokens) for the moving-average TTR.
#'   Defaults to 100, the value that used to be hard-coded.
#' @return A list with one element per file. Each element is an unnamed list
#'   of 10 scalars, in this order: file name, score (character, `NA` when the
#'   file has no `@Criterion` line), Token, Type, NoS (number of utterance
#'   lines), TTR, GI (Guiraud index), MATTR, AWL (average word length),
#'   ASL (average sentence length). Indices that cannot be computed for an
#'   empty text are `NA`.
myIndexNICER3 <- function(dir = ".", window = 100) {
  stopifnot(is.numeric(window), length(window) == 1L, window >= 1)
  window <- as.integer(window)

  # Prefix of the learner (JPN) / native-speaker (NS) utterance tiers.
  tier_pattern <- "\\*(JPN|NS)...:\t"

  # Moving-average TTR over a circular window: the text is treated as a ring,
  # so there is one window per token and every token is counted equally often.
  # When the text is shorter than the window, the window shrinks to the text
  # length (result equals the plain TTR) instead of dividing by the nominal
  # size.
  circular_mattr <- function(words, size) {
    n <- length(words)
    if (n == 0L) {
      return(NA_real_)
    }
    size <- min(size, n)
    offsets <- seq_len(size) - 1L
    window_ttr <- vapply(
      seq_len(n),
      function(start) {
        idx <- (start - 1L + offsets) %% n + 1L
        length(unique(words[idx])) / size
      },
      numeric(1)
    )
    mean(window_ttr)
  }

  files <- list.files(dir)
  # list.files() also returns sub-directories, which cannot be read as text.
  files <- files[!dir.exists(file.path(dir, files))]

  result <- vector("list", length(files))

  for (k in seq_along(files)) {
    file_name <- files[[k]]
    lines <- readLines(file.path(dir, file_name), warn = FALSE)

    # Criterion score: always exactly one value so that unlist() on the
    # result keeps 10 values per file.
    criterion <- grep("@Criterion", lines, value = TRUE)
    Score <- if (length(criterion) > 0L) {
      gsub("@Criterion:\t", "", criterion[[1L]])
    } else {
      NA_character_
    }

    # Utterance tiers only; header lines and dependent tiers are ignored.
    body <- gsub(tier_pattern, "", grep(tier_pattern, lines, value = TRUE))
    body <- body[body != ""]

    words <- unlist(strsplit(tolower(body), "\\W+"))
    # strsplit() yields "" when a line starts with a non-word character.
    words <- words[nzchar(words)]

    Token <- length(words)
    Type <- length(unique(words))
    # Sentences are the utterance lines, not every line of the file.
    NoS <- length(body)

    if (Token > 0L) {
      TTR <- Type / Token
      GI <- Type / sqrt(Token)
      AWL <- sum(nchar(words)) / Token
    } else {
      TTR <- NA_real_
      GI <- NA_real_
      AWL <- NA_real_
    }
    ASL <- if (NoS > 0L) Token / NoS else NA_real_
    MATTR <- circular_mattr(words, window)

    result[[k]] <- list(
      file_name, Score, Token, Type, NoS, TTR, GI, MATTR, AWL, ASL
    )
  }

  result
}
> JP_result <- myIndexNICER3()
> str(JP_result)
List of 50
 $ :List of 10
  ..$ : chr "JPN501.txt"
  ..$ : chr "4"
  ..$ : int 319
  ..$ : int 135
  ..$ : int 123
  ..$ : num 0.423
  ..$ : num 7.56
  ..$ : num 0.592
  ..$ : num 4.3
  ..$ : num 2.59
 $ :List of 10
  ..$ : chr "JPN502.txt"
  ..$ : chr "4"
  ..$ : int 356
  ..$ : int 161
  ..$ : int 120
  ..$ : num 0.452
  ..$ : num 8.53
  ..$ : num 0.665
  ..$ : num 4.23
  ..$ : num 2.97
 $ :List of 10
  ..$ : chr "JPN503.txt"
  ..$ : chr "3"
  ..$ : int 201
  ..$ : int 121
(以下略)
その後の処理
unlistして、matrixとして一覧表にする。
> matrix(unlist(JP_result), nrow=10, ncol=50)
転置行列(縦横変換) t() (transpose)
> t(matrix(unlist(JP_result), nrow=10, ncol=50))
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
[1,] "JPN501.txt" "4" "319" "135" "123" "0.423197492163009" "7.55854889790403" "0.592131661442006" "4.30407523510972" "2.59349593495935"
[2,] "JPN502.txt" "4" "356" "161" "120" "0.452247191011236" "8.5329829340512" "0.664915730337079" "4.23314606741573" "2.96666666666667"
[3,] "JPN503.txt" "3" "201" "121" "70" "0.601990049751244" "8.53468195188904" "0.717014925373134" "4.74626865671642" "2.87142857142857"
[4,] "JPN504.txt" "4" "260" "140" "114" "0.538461538461538" "8.68243142124459" "0.687769230769229" "4.76153846153846" "2.28070175438596"
[5,] "JPN505.txt" "4" "420" "175" "106" "0.416666666666667" "8.53912563829967" "0.634190476190476" "3.9952380952381" "3.9622641509434"
[6,] "JPN506.txt" "3" "261" "124" "93" "0.475095785440613" "7.67540731131814" "0.639003831417626" "4.0727969348659" "2.80645161290323"
データフレームに
> as.data.frame(t(matrix(unlist(JP_result), nrow=10, ncol=50)))
V1 V2 V3 V4 V5 V6 V7 V8 V9 V10
1 JPN501.txt 4 319 135 123 0.423197492163009 7.55854889790403 0.592131661442006 4.30407523510972 2.59349593495935
2 JPN502.txt 4 356 161 120 0.452247191011236 8.5329829340512 0.664915730337079 4.23314606741573 2.96666666666667
3 JPN503.txt 3 201 121 70 0.601990049751244 8.53468195188904 0.717014925373134 4.74626865671642 2.87142857142857
4 JPN504.txt 4 260 140 114 0.538461538461538 8.68243142124459 0.687769230769229 4.76153846153846 2.28070175438596
5 JPN505.txt 4 420 175 106 0.416666666666667 8.53912563829967 0.634190476190476 3.9952380952381 3.9622641509434
6 JPN506.txt 3 261 124 93 0.475095785440613 7.67540731131814 0.639003831417626 4.0727969348659 2.80645161290323
> JP50.df <- as.data.frame(t(matrix(unlist(JP_result), nrow=10, ncol=50)))
> str(JP50.df)
'data.frame': 50 obs. of 10 variables:
$ V1 : Factor w/ 50 levels "JPN501.txt","JPN502.txt",..: 1 2 3 4 5 6 7 8 9 10 ...
$ V2 : Factor w/ 4 levels "2","3","4","5": 3 3 2 3 3 2 3 2 3 2 ...
$ V3 : Factor w/ 45 levels "168","183","187",..: 25 31 8 16 39 17 32 7 18 2 ...
$ V4 : Factor w/ 39 levels "100","103","104",..: 14 23 10 18 28 11 21 38 3 39 ...
$ V5 : Factor w/ 35 levels "102","105","106",..: 14 13 22 9 3 30 6 30 28 23 ...
$ V6 : Factor w/ 50 levels "0.353115727002967",..: 15 26 50 44 12 33 13 36 10 46 ...
$ V7 : Factor w/ 50 levels "6.06974962128721",..: 23 35 36 43 37 26 29 9 4 16 ...
$ V8 : Factor w/ 50 levels "0.539656862745098",..: 6 39 49 46 24 27 34 17 3 38 ...
$ V9 : Factor w/ 49 levels "3.78571428571429",..: 23 21 44 45 9 12 22 34 10 27 ...
$ V10: Factor w/ 50 levels "1.93103448275862",..: 22 32 29 12 48 27 37 7 31 17 ...
https://sugiura-ken.org/wiki/