https://www.rstudio.com/products/rstudio/download/
RMarkdownに必要なパッケージのインストール
分析対象データをフォルダーにまとめて入れておく。
C:\Users\...\Rtext\NICER1_3_2\NICER_NNS
* 学習者データのフォルダー名 NICER_NNS を確認
# Check the current working directory; the extraction loop further down
# reads its input through list.files(), i.e. from this directory.
getwd()
| 指標 | 説明 |
|---|---|
| i | ファイル名 |
| Score | Criterionスコア |
| Tokens | 総語数 |
| Types | 異なり語数 |
| NoS | 文数 |
| TTR | 語彙多様性(Type-Token Ratio: 異なり語数 ÷ 総語数) |
| GI | 語彙多様性(Guiraud Index: 異なり語数 ÷ √総語数) |
| MATTR | 語彙多様性(Moving-Average TTR: 100語の移動窓ごとのTTRの平均) |
| AWL | 平均単語長(文字数) |
| ASL | 平均文長(単語数) |
# 上の言語特徴を抽出するスクリプト
# 参照
# https://sugiura-ken.org/wiki/wiki.cgi/exp?page=myIndices4%2ER
# https://sugiura-ken.org/wiki/wiki.cgi/exp?page=myIndexNICER3
# copyleft 2019-01-17 sugiura@nagoya-u.jp
# Extract the indices for every file in the working directory.
#
# After the loop, `result` holds one record per file; each record is a list of
# exactly ten scalars in this order:
#   file name, Score, Tokens, Types, NoS, TTR, GI, MATTR, AWL, ASL
# The code that follows turns `result` into a data frame with
# matrix(unlist(result), nrow=10), so every record MUST contain exactly ten
# length-one values.
files <- list.files()
result <- vector("list", length(files)) # preallocated: one slot per file

window.size <- 100 # MATTR window length, in tokens
window.offsets <- seq_len(window.size) - 1 # 0 .. window.size - 1

for (k in seq_along(files)) {
  i <- files[k] # file name; also stored as the record's ID

  # Read the file as one character string per line.
  lines.tmp <- scan(i, what = "char", sep = "\n")

  # Criterion score: the text following "@Criterion:<TAB>".
  # Indexing with [1] guarantees a length-one value: NA when the file has no
  # such line, the first hit when it has several. A zero-length Score would be
  # dropped by unlist() and shift the columns of all following records.
  Criterion <- grep("@Criterion:\t", lines.tmp, value = TRUE)
  Score <- as.integer(gsub("@Criterion:\t", "", Criterion))[1]

  # Keep only the text lines: "*", then JPN or NS, then any three characters,
  # then a colon and a tab.
  data.tmp <- grep("\\*(JPN|NS)...:\t", lines.tmp, value = TRUE)
  # Strip that leading tag, then drop lines that are empty afterwards.
  body.tmp <- gsub("\\*(JPN|NS)...:\t", "", data.tmp)
  body.tmp <- body.tmp[body.tmp != ""]

  # Normalise: lower-case, non-word characters to spaces, collapse runs of
  # spaces, remove a trailing space.
  body.lower <- tolower(body.tmp)
  body.nopunc <- gsub("\\W", " ", body.lower)
  body.single <- gsub(" +", " ", body.nopunc)
  body.clear <- gsub(" $", "", body.single)

  # Split on spaces and discard empty strings (e.g. from a leading space).
  body.token <- unlist(strsplit(body.clear, " "))
  body.token <- body.token[body.token != ""]
  body.type <- unique(body.token)

  Tokens <- length(body.token) # total number of words
  Types <- length(body.type) # number of distinct words
  NoS <- length(body.tmp) # number of retained text lines
  TTR <- Types / Tokens
  GI <- Types / sqrt(Tokens)
  AWL <- nchar(paste(body.token, collapse = "")) / Tokens # characters per word
  ASL <- Tokens / NoS # words per retained line

  # MATTR: mean type-token ratio over `Tokens` windows of `window.size` tokens,
  # one window starting at each token. Windows wrap around to the start of the
  # text; the wrap-around is done with modular indexing so no out-of-range
  # (NA) elements are ever selected.
  # For a text shorter than the window every window contains all types, so
  # the value equals Types / window.size.
  ttrsum <- 0
  for (j in seq_len(Tokens)) {
    mado <- body.token[(j - 1 + window.offsets) %% Tokens + 1]
    wttr <- length(unique(mado)) / window.size
    ttrsum <- ttrsum + wttr
  }
  MATTR <- ttrsum / Tokens

  result[[k]] <- list(i, Score, Tokens, Types, NoS, TTR, GI, MATTR, AWL, ASL)
}
# Flatten the per-file records and lay them out row by row, ten values per
# row, then convert the resulting matrix to a data frame.
result.df <- as.data.frame(matrix(unlist(result), ncol = 10, byrow = TRUE))
# Name the ten columns.
names(result.df) <- c(
  "ID", "Score", "Token", "Type", "NoS",
  "TTR", "GI", "MATTR", "AWL", "ASL"
)
# Show the first rows.
head(result.df)
## ID Score Token Type NoS TTR GI
## 1 JPN501.txt 4 319 134 30 0.420062695924765 7.50255964680845
## 2 JPN502.txt 4 356 160 29 0.449438202247191 8.47998304005088
## 3 JPN503.txt 3 201 121 13 0.601990049751244 8.53468195188904
## 4 JPN504.txt 4 260 139 27 0.534615384615385 8.62041405394999
## 5 JPN505.txt 4 420 175 25 0.416666666666667 8.53912563829967
## 6 JPN506.txt 3 261 124 20 0.475095785440613 7.67540731131814
## MATTR AWL ASL
## 1 0.589090909090909 4.30407523510972 10.6333333333333
## 2 0.664241573033708 4.23314606741573 12.2758620689655
## 3 0.717014925373134 4.74626865671642 15.4615384615385
## 4 0.684153846153844 4.76538461538462 9.62962962962963
## 5 0.634190476190476 3.9952380952381 16.8
## 6 0.639003831417626 4.0727969348659 13.05
# Inspect the structure of the data frame (column names and types).
str(result.df)
## 'data.frame': 381 obs. of 10 variables:
## $ ID : chr "JPN501.txt" "JPN502.txt" "JPN503.txt" "JPN504.txt" ...
## $ Score: chr "4" "4" "3" "4" ...
## $ Token: chr "319" "356" "201" "260" ...
## $ Type : chr "134" "160" "121" "139" ...
## $ NoS : chr "30" "29" "13" "27" ...
## $ TTR : chr "0.420062695924765" "0.449438202247191" "0.601990049751244" "0.534615384615385" ...
## $ GI : chr "7.50255964680845" "8.47998304005088" "8.53468195188904" "8.62041405394999" ...
## $ MATTR: chr "0.589090909090909" "0.664241573033708" "0.717014925373134" "0.684153846153844" ...
## $ AWL : chr "4.30407523510972" "4.23314606741573" "4.74626865671642" "4.76538461538462" ...
## $ ASL : chr "10.6333333333333" "12.2758620689655" "15.4615384615385" "9.62962962962963" ...
# Convert every column except the first one (ID) to numeric:
# Score, Token, Type, NoS, TTR, GI, MATTR, AWL and ASL.
result.df[-1] <- lapply(result.df[-1], as.numeric)
# Per-column summary of the data frame.
summary(result.df)
## ID Score Token Type
## Length:381 Min. :1.000 Min. : 90.0 Min. : 50.0
## Class :character 1st Qu.:3.000 1st Qu.:212.0 1st Qu.:102.0
## Mode :character Median :3.000 Median :265.0 Median :123.0
## Mean :3.522 Mean :279.1 Mean :126.2
## 3rd Qu.:4.000 3rd Qu.:326.0 3rd Qu.:146.0
## Max. :5.000 Max. :736.0 Max. :252.0
## NA's :2
## NoS TTR GI MATTR
## Min. : 7.00 Min. :0.2504 Min. : 4.499 Min. :0.4360
## 1st Qu.:17.00 1st Qu.:0.4201 1st Qu.: 6.947 1st Qu.:0.6047
## Median :21.00 Median :0.4658 Median : 7.502 Median :0.6346
## Mean :22.07 Mean :0.4661 Mean : 7.571 Mean :0.6321
## 3rd Qu.:26.00 3rd Qu.:0.5123 3rd Qu.: 8.287 3rd Qu.:0.6637
## Max. :51.00 Max. :0.6581 Max. :10.394 Max. :0.7579
##
## AWL ASL
## Min. :3.372 Min. : 7.04
## 1st Qu.:4.110 1st Qu.:11.00
## Median :4.368 Median :12.33
## Mean :4.362 Mean :12.87
## 3rd Qu.:4.612 3rd Qu.:14.35
## Max. :5.388 Max. :24.26
##
# Scatter-plot matrix of all columns except the first (ID).
pairs(result.df[-1])
# PerformanceAnalytics is loaded for chart.Correlation(), applied to the
# same columns.
library(PerformanceAnalytics)
chart.Correlation(result.df[-1])
# Drop rows that contain NA; the models below are fitted on result.df2.
result.df2 <- na.omit(result.df)
# Full model: Score regressed on all eight indices.
lm.result <- lm(Score ~ Token + Type + NoS + TTR + GI + MATTR + AWL + ASL, data=result.df2)
# Summary of the fitted model.
summary(lm.result)
##
## Call:
## lm(formula = Score ~ Token + Type + NoS + TTR + GI + MATTR +
## AWL + ASL, data = result.df2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.24479 -0.24936 -0.01181 0.25896 1.48163
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.510959 0.627301 -0.815 0.41586
## Token 0.002897 0.002160 1.341 0.18076
## Type -0.011851 0.010594 -1.119 0.26401
## NoS 0.014398 0.013959 1.031 0.30301
## TTR -6.488672 1.259864 -5.150 4.24e-07 ***
## GI 0.662440 0.255294 2.595 0.00984 **
## MATTR -0.598291 1.338171 -0.447 0.65507
## AWL 0.508490 0.060279 8.436 7.50e-16 ***
## ASL 0.044242 0.024013 1.842 0.06622 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3705 on 370 degrees of freedom
## Multiple R-squared: 0.7697, Adjusted R-squared: 0.7647
## F-statistic: 154.6 on 8 and 370 DF, p-value: < 2.2e-16
# Automatic model selection by AIC, removing predictors one at a time.
step(lm.result)
## Start: AIC=-743.7
## Score ~ Token + Type + NoS + TTR + GI + MATTR + AWL + ASL
##
## Df Sum of Sq RSS AIC
## - MATTR 1 0.0274 50.822 -745.49
## - NoS 1 0.1460 50.940 -744.61
## - Type 1 0.1718 50.966 -744.42
## - Token 1 0.2469 51.041 -743.86
## <none> 50.794 -743.70
## - ASL 1 0.4660 51.260 -742.24
## - GI 1 0.9243 51.719 -738.86
## - TTR 1 3.6415 54.436 -719.46
## - AWL 1 9.7688 60.563 -679.03
##
## Step: AIC=-745.49
## Score ~ Token + Type + NoS + TTR + GI + AWL + ASL
##
## Df Sum of Sq RSS AIC
## - Type 1 0.1444 50.966 -746.42
## - NoS 1 0.1495 50.971 -746.38
## - Token 1 0.2195 51.041 -745.86
## <none> 50.822 -745.49
## - ASL 1 0.4547 51.276 -744.12
## - GI 1 1.0430 51.865 -739.79
## - TTR 1 3.7234 54.545 -720.70
## - AWL 1 9.8231 60.645 -680.52
##
## Step: AIC=-746.42
## Score ~ Token + NoS + TTR + GI + AWL + ASL
##
## Df Sum of Sq RSS AIC
## - Token 1 0.0763 51.042 -747.85
## - NoS 1 0.2083 51.174 -746.87
## <none> 50.966 -746.42
## - ASL 1 0.5571 51.523 -744.30
## - TTR 1 4.6859 55.652 -715.08
## - GI 1 5.8079 56.774 -707.52
## - AWL 1 9.9838 60.950 -680.62
##
## Step: AIC=-747.85
## Score ~ NoS + TTR + GI + AWL + ASL
##
## Df Sum of Sq RSS AIC
## <none> 51.042 -747.85
## - NoS 1 0.9726 52.015 -742.70
## - ASL 1 1.9347 52.977 -735.75
## - TTR 1 5.7852 56.828 -709.16
## - GI 1 7.0603 58.103 -700.75
## - AWL 1 9.9418 60.984 -682.40
##
## Call:
## lm(formula = Score ~ NoS + TTR + GI + AWL + ASL, data = result.df2)
##
## Coefficients:
## (Intercept) NoS TTR GI AWL ASL
## -0.26947 0.02456 -5.98569 0.40258 0.50669 0.06055
# The model selected last by step().
lm.result.best <- lm(Score ~ NoS + TTR + GI + AWL + ASL, data = result.df2)
# car is loaded for vif(), used next.
library(car)
## 要求されたパッケージ carData をロード中です
# Variance inflation factors of the selected model's predictors.
vif(lm.result.best)
## NoS TTR GI AWL ASL
## 11.392371 10.030198 8.154469 1.114866 5.487062
# Same model with TTR removed from the predictors; recompute the VIFs.
lm.result.best2 <- lm(Score ~ NoS + GI + AWL + ASL, data = result.df2)
vif(lm.result.best2)
## NoS GI AWL ASL
## 1.478281 1.425651 1.104070 1.210935
# Summary of the model considered best.
summary(lm.result.best2)
##
## Call:
## lm(formula = Score ~ NoS + GI + AWL + ASL, data = result.df2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.42963 -0.24370 0.00343 0.26244 1.21389
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.809310 0.299922 -9.367 < 2e-16 ***
## NoS 0.080428 0.003496 23.003 < 2e-16 ***
## GI 0.071546 0.024694 2.897 0.00399 **
## AWL 0.468652 0.062336 7.518 4.15e-13 ***
## ASL 0.152972 0.007971 19.191 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3898 on 374 degrees of freedom
## Multiple R-squared: 0.7423, Adjusted R-squared: 0.7396
## F-statistic: 269.4 on 4 and 374 DF, p-value: < 2.2e-16
# Fit a tree model with rpart, using the same response and the same eight
# predictors as the full linear model.
library(rpart)
DT.result <- rpart(
  Score ~ Token + Type + NoS + TTR + GI + MATTR + AWL + ASL,
  data = result.df2
)

# partykit is used for a more readable tree plot. Install it only when it is
# not available yet, so re-running or knitting this script neither reinstalls
# the package nor requires network access every time.
if (!requireNamespace("partykit", quietly = TRUE)) {
  install.packages("partykit")
}
library(partykit)

# Convert the rpart fit to a party object and plot it.
plot(as.party(DT.result))