# Build a per-file table of lexical and syntactic indices for a corpus folder.
#
# Every regular file in `dir` is read as a transcript. The "@Topic:" header
# line supplies the topic and the "@Criterion" header line supplies the score;
# lines containing a speaker tag ("*JPN..." or "*NS...") are the utterances.
#
# Args:
#   dir: Folder that holds the transcript files. Defaults to the current
#        working directory, so existing calls `myIndex.df()` work unchanged.
#
# Returns:
#   A data.frame with one row per file and the columns
#     fileV  file name
#     topicV topic string (NA when the file has no "@Topic:" line)
#     scoreV score as integer (NA when missing or not an integer)
#     tokenV number of tokens        typeV number of distinct tokens
#     TTRV   type / token            GIV   type / sqrt(token)
#     NoSV   number of utterances    ASLV  token / NoS
#     AWLV   characters / token
#   An empty folder yields a 0-row data.frame with the same ten columns.
myIndex.df <- function(dir = ".") {
  paths <- list.files(dir, full.names = TRUE)
  paths <- paths[!dir.exists(paths)] # sub-folders cannot be read as text
  n <- length(paths)

  # Preallocate one slot per file. Topic and score start as NA so that a file
  # without the header line still contributes exactly one value per column.
  fileV <- basename(paths)
  topicV <- rep(NA_character_, n)
  scoreV <- rep(NA_integer_, n)
  tokenV <- integer(n)
  typeV <- integer(n)
  TTRV <- numeric(n)
  GIV <- numeric(n)
  NoSV <- integer(n)
  ASLV <- numeric(n)
  AWLV <- numeric(n)

  for (k in seq_len(n)) {
    lines <- readLines(paths[k], warn = FALSE)

    # Header values: keep only the first match if a header is repeated.
    topic <- gsub("@Topic:\t", "", grep("@Topic:", lines, value = TRUE))
    if (length(topic) > 0) {
      topicV[k] <- topic[1]
    }
    score <- gsub("@Criterion:\t", "", grep("@Criterion", lines, value = TRUE))
    if (length(score) > 0) {
      scoreV[k] <- as.integer(score[1])
    }

    # Utterances: strip the speaker tag and punctuation, lower-case, then
    # split on single spaces and drop the empty strings the split leaves.
    utterances <- grep("\\*(JPN|NS)", lines, value = TRUE)
    text <- gsub("\\*(JPN|NS)...:\t", "", utterances)
    text <- tolower(gsub("[[:punct:]]", "", text))
    tokens <- unlist(strsplit(text, " "))
    tokens <- tokens[tokens != ""]

    token <- length(tokens)
    type <- length(unique(tokens))
    NoS <- length(utterances)

    tokenV[k] <- token
    typeV[k] <- type
    TTRV[k] <- type / token
    GIV[k] <- type / sqrt(token)
    NoSV[k] <- NoS
    ASLV[k] <- token / NoS
    # Total characters over all tokens divided by the number of tokens.
    AWLV[k] <- sum(nchar(tokens)) / token
  }

  data.frame(fileV, topicV, scoreV, tokenV, typeV, TTRV, GIV, NoSV, ASLV, AWLV)
}
# Build the learner (NNS) index table from the transcripts in NICER_NNS.
# The previous working directory is restored afterwards (even if reading
# fails) so that later relative paths still resolve from the project root.
old.wd <- setwd("NICER_NNS")
NNS.Index.df <- tryCatch(myIndex.df(), finally = setwd(old.wd))
# Give the columns readable names.
names(NNS.Index.df) <- c("ID", "Topic", "Score", "Token", "Type", "TTR", "GI", "NoS", "ASL", "AWL")
# File ID and topic are categorical.
NNS.Index.df$ID <- as.factor(NNS.Index.df$ID)
NNS.Index.df$Topic <- as.factor(NNS.Index.df$Topic)
summary(NNS.Index.df)
## ID Topic Score Token
## JPN501.txt: 1 education:145 Min. :1.000 Min. : 85.0
## JPN502.txt: 1 money : 77 1st Qu.:3.000 1st Qu.:209.0
## JPN503.txt: 1 sports :159 Median :3.000 Median :262.0
## JPN504.txt: 1 Mean :3.522 Mean :275.4
## JPN505.txt: 1 3rd Qu.:4.000 3rd Qu.:323.0
## JPN506.txt: 1 Max. :5.000 Max. :728.0
## (Other) :375 NA's :2
## Type TTR GI NoS
## Min. : 49.0 Min. :0.2531 Min. : 4.566 Min. : 7.00
## 1st Qu.:101.0 1st Qu.:0.4230 1st Qu.: 6.947 1st Qu.:17.00
## Median :122.0 Median :0.4699 Median : 7.502 Median :21.00
## Mean :125.6 Mean :0.4697 Mean : 7.582 Mean :22.07
## 3rd Qu.:146.0 3rd Qu.:0.5141 3rd Qu.: 8.279 3rd Qu.:26.00
## Max. :251.0 Max. :0.6581 Max. :10.443 Max. :51.00
##
## ASL AWL
## Min. : 6.96 Min. :3.507
## 1st Qu.:10.82 1st Qu.:4.163
## Median :12.20 Median :4.395
## Mean :12.70 Mean :4.419
## 3rd Qu.:14.08 3rd Qu.:4.652
## Max. :24.00 Max. :5.415
##
# Show the column types and first values of the learner index table.
str(NNS.Index.df)
## 'data.frame': 381 obs. of 10 variables:
## $ ID : Factor w/ 381 levels "JPN501.txt","JPN502.txt",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Topic: Factor w/ 3 levels "education","money",..: 3 1 1 3 3 2 1 3 3 1 ...
## $ Score: int 4 4 3 4 4 3 4 3 4 3 ...
## $ Token: int 319 351 201 260 417 260 355 195 260 183 ...
## $ Type : int 134 158 121 139 174 123 149 97 103 99 ...
## $ TTR : num 0.42 0.45 0.602 0.535 0.417 ...
## $ GI : num 7.5 8.43 8.53 8.62 8.52 ...
## $ NoS : int 30 29 13 27 25 20 26 20 19 14 ...
## $ ASL : num 10.63 12.1 15.46 9.63 16.68 ...
## $ AWL : num 4.3 4.29 4.75 4.77 4.02 ...
# Keep only the rows without any NA and summarise the complete cases.
NNS.Index.df2 <- na.omit(NNS.Index.df)
summary(NNS.Index.df2)
## ID Topic Score Token
## JPN501.txt: 1 education:145 Min. :1.000 Min. : 85.0
## JPN502.txt: 1 money : 77 1st Qu.:3.000 1st Qu.:209.0
## JPN503.txt: 1 sports :157 Median :3.000 Median :262.0
## JPN504.txt: 1 Mean :3.522 Mean :275.6
## JPN505.txt: 1 3rd Qu.:4.000 3rd Qu.:322.5
## JPN506.txt: 1 Max. :5.000 Max. :728.0
## (Other) :373
## Type TTR GI NoS
## Min. : 49.0 Min. :0.2531 Min. : 4.566 Min. : 7.00
## 1st Qu.:101.0 1st Qu.:0.4232 1st Qu.: 6.952 1st Qu.:17.00
## Median :122.0 Median :0.4699 Median : 7.503 Median :21.00
## Mean :125.7 Mean :0.4698 Mean : 7.586 Mean :22.08
## 3rd Qu.:146.0 3rd Qu.:0.5137 3rd Qu.: 8.283 3rd Qu.:26.00
## Max. :251.0 Max. :0.6581 Max. :10.443 Max. :51.00
##
## ASL AWL
## Min. : 6.96 Min. :3.507
## 1st Qu.:10.81 1st Qu.:4.163
## Median :12.21 Median :4.395
## Mean :12.71 Mean :4.420
## 3rd Qu.:14.11 3rd Qu.:4.652
## Max. :24.00 Max. :5.415
##
# Build the native-speaker (NS) index table from the transcripts in NICER_NS.
# The previous working directory is restored afterwards (even if reading
# fails) so that this step does not leave the session inside the data folder.
old.wd <- setwd("NICER_NS")
NS.Index.df <- tryCatch(myIndex.df(), finally = setwd(old.wd))
# Give the columns readable names.
names(NS.Index.df) <- c("ID", "Topic", "Score", "Token", "Type", "TTR", "GI", "NoS", "ASL", "AWL")
# File ID and topic are categorical.
NS.Index.df$ID <- as.factor(NS.Index.df$ID)
NS.Index.df$Topic <- as.factor(NS.Index.df$Topic)
head(NS.Index.df)
## ID Topic Score Token Type TTR GI NoS ASL AWL
## 1 NS501.txt education 5 736 359 0.4877717 13.23292 39 18.87179 4.592391
## 2 NS502.txt education 6 636 340 0.5345912 13.48188 26 24.46154 5.201258
## 3 NS503.txt education 6 834 353 0.4232614 12.22339 22 37.90909 5.565947
## 4 NS504.txt education 6 824 336 0.4077670 11.70511 30 27.46667 5.276699
## 5 NS505.txt sports 6 898 393 0.4376392 13.11458 39 23.02564 4.749443
## 6 NS506.txt education 6 829 339 0.4089264 11.77396 31 26.74194 4.460796
「すべて」を含む一番一般的なモデル GLMM glmer()
ランダム効果を入れない場合は、GLM glm()
応答変数の分布を確認して、どの分布を使うかを決める
1. 正規分布だったら gaussian (重回帰分析はこれに相当する)
2. 正の連続値だったら Gamma
3. 0以上の整数(カウントデータ)だったら poisson
4. 正誤などの二値だったら binomial
正規分布の場合は、family=gaussian でもよいが、glm()の代わりに lm()でよい。
VIFを考えて、入れる要因を選択する
合理的に考えて、ありうる交互作用も要因に入れる
MuMInパッケージで、最適モデルを選ぶ
最適モデルで分析
可視化
妥当性チェック
レポート
library(tidyverse)
library(openxlsx)
library(lme4)
library(lmerTest)
library(MuMIn)
library(effects)
library(ggplot2)
library(easystats)
library(fitdistrplus)
# Per-column summary of the native-speaker table, including the NA count in Score.
summary(NS.Index.df)
## ID Topic Score Token Type
## NS501.txt: 1 education:47 Min. :4.000 Min. : 451.0 Min. :223.0
## NS502.txt: 1 money :11 1st Qu.:5.000 1st Qu.: 801.5 1st Qu.:335.0
## NS503.txt: 1 sports :13 Median :6.000 Median : 954.0 Median :383.0
## NS504.txt: 1 Mean :5.625 Mean : 977.0 Mean :386.1
## NS505.txt: 1 3rd Qu.:6.000 3rd Qu.:1094.0 3rd Qu.:428.0
## NS506.txt: 1 Max. :6.000 Max. :2330.0 Max. :763.0
## (Other) :65 NA's :31
## TTR GI NoS ASL
## Min. :0.3113 Min. : 9.936 Min. : 22.00 Min. :11.65
## 1st Qu.:0.3630 1st Qu.:11.497 1st Qu.: 34.50 1st Qu.:19.05
## Median :0.4078 Median :12.319 Median : 43.00 Median :21.75
## Mean :0.4072 Mean :12.400 Mean : 45.66 Mean :22.37
## 3rd Qu.:0.4383 3rd Qu.:13.333 3rd Qu.: 50.50 3rd Qu.:24.54
## Max. :0.5346 Max. :15.807 Max. :155.00 Max. :37.91
##
## AWL
## Min. :4.096
## 1st Qu.:4.600
## Median :4.791
## Mean :4.799
## 3rd Qu.:4.981
## Max. :5.633
##
# Keep only the rows without any NA and summarise the complete cases.
NS.Index.df2 <- na.omit(NS.Index.df)
summary(NS.Index.df2)
## ID Topic Score Token Type
## NS501.txt: 1 education:28 Min. :4.000 Min. :451.0 Min. :223.0
## NS502.txt: 1 money : 4 1st Qu.:5.000 1st Qu.:670.5 1st Qu.:302.5
## NS503.txt: 1 sports : 8 Median :6.000 Median :826.5 Median :339.5
## NS504.txt: 1 Mean :5.625 Mean :786.8 Mean :339.6
## NS505.txt: 1 3rd Qu.:6.000 3rd Qu.:932.0 3rd Qu.:378.5
## NS506.txt: 1 Max. :6.000 Max. :990.0 Max. :470.0
## (Other) :34
## TTR GI NoS ASL
## Min. :0.3330 Min. : 9.936 Min. :22.00 Min. :15.53
## 1st Qu.:0.4062 1st Qu.:11.366 1st Qu.:29.75 1st Qu.:19.21
## Median :0.4324 Median :12.093 Median :37.50 Median :21.71
## Mean :0.4366 Mean :12.110 Mean :36.20 Mean :22.25
## 3rd Qu.:0.4705 3rd Qu.:12.891 3rd Qu.:43.25 3rd Qu.:23.81
## Max. :0.5346 Max. :15.106 Max. :55.00 Max. :37.91
##
## AWL
## Min. :4.167
## 1st Qu.:4.682
## Median :4.910
## Mean :4.892
## 3rd Qu.:5.123
## Max. :5.633
##
# Look at the distribution of the response before choosing a model family.
hist(NS.Index.df2$Score)
### Cullen and Frey graph
# descdist() comes from fitdistrplus; boot=500 requests 500 bootstrap samples.
descdist(NS.Index.df2$Score, boot=500)
## summary statistics
## ------
## min: 4 max: 6
## median: 6
## mean: 5.625
## estimated sd: 0.5400617
## estimated skewness: -1.044158
## estimated kurtosis: 3.084303
正規分布とみなせない。
正の整数だけなので、poissonと想定してみる。
# Poisson GLM of Score on the full factorial interaction of the four indices
# (main effects plus every 2-, 3- and 4-way interaction).
model.glm.1.ns <- glm(Score ~ Type * NoS * ASL * AWL, data = NS.Index.df2, family = poisson)
summary(model.glm.1.ns)
##
## Call:
## glm(formula = Score ~ Type * NoS * ASL * AWL, family = poisson,
## data = NS.Index.df2)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.936e+01 4.086e+02 0.047 0.962
## Type -2.929e-02 1.126e+00 -0.026 0.979
## NoS -4.803e-01 1.328e+01 -0.036 0.971
## ASL -7.647e-01 1.958e+01 -0.039 0.969
## AWL -3.170e+00 8.004e+01 -0.040 0.968
## Type:NoS 6.962e-04 3.702e-02 0.019 0.985
## Type:ASL 1.305e-03 5.355e-02 0.024 0.981
## NoS:ASL 1.890e-02 6.431e-01 0.029 0.977
## Type:AWL 5.244e-03 2.222e-01 0.024 0.981
## NoS:AWL 7.084e-02 2.635e+00 0.027 0.979
## ASL:AWL 1.263e-01 3.803e+00 0.033 0.974
## Type:NoS:ASL -2.588e-05 1.790e-03 -0.014 0.988
## Type:NoS:AWL -8.201e-05 7.386e-03 -0.011 0.991
## Type:ASL:AWL -2.041e-04 1.047e-02 -0.020 0.984
## NoS:ASL:AWL -2.367e-03 1.269e-01 -0.019 0.985
## Type:NoS:ASL:AWL 1.873e-06 3.547e-04 0.005 0.996
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 2.09681 on 39 degrees of freedom
## Residual deviance: 0.31351 on 24 degrees of freedom
## AIC: 175.92
##
## Number of Fisher Scoring iterations: 3
なにも有意にならない。
Cullen and Frey graphによればベータ分布
lme4ではベータ分布は分析できない
# Treat Score as a categorical outcome from here on (overwrites the integer column).
NS.Index.df2$Score <- as.factor(NS.Index.df2$Score)
str(NS.Index.df2)
## 'data.frame': 40 obs. of 10 variables:
## $ ID : Factor w/ 71 levels "NS501.txt","NS502.txt",..: 1 2 3 4 5 6 7 8 10 11 ...
## $ Topic: Factor w/ 3 levels "education","money",..: 1 1 1 1 3 1 1 1 2 3 ...
## $ Score: Factor w/ 3 levels "4","5","6": 2 3 3 3 3 3 2 3 3 3 ...
## $ Token: int 736 636 834 824 898 829 597 848 760 886 ...
## $ Type : int 359 340 353 336 393 339 262 332 301 372 ...
## $ TTR : num 0.488 0.535 0.423 0.408 0.438 ...
## $ GI : num 13.2 13.5 12.2 11.7 13.1 ...
## $ NoS : int 39 26 22 30 39 31 27 43 22 45 ...
## $ ASL : num 18.9 24.5 37.9 27.5 23 ...
## $ AWL : num 4.59 5.2 5.57 5.28 4.75 ...
## - attr(*, "na.action")= 'omit' Named int [1:31] 9 15 18 19 20 22 24 25 26 30 ...
## ..- attr(*, "names")= chr [1:31] "9" "15" "18" "19" ...
library(rpart)
# Decision tree for the native speakers; Score is a factor at this point,
# so the tree predicts a score class from the four indices.
DT.model.1 <- rpart(Score ~ Type + NoS + ASL + AWL, data = NS.Index.df2)
summary(DT.model.1)
## Call:
## rpart(formula = Score ~ Type + NoS + ASL + AWL, data = NS.Index.df2)
## n= 40
##
## CP nsplit rel error xerror xstd
## 1 0.6428571 0 1.0000000 1.0000000 0.2154729
## 2 0.0100000 1 0.3571429 0.4285714 0.1613084
##
## Variable importance
## Type NoS ASL AWL
## 53 20 17 10
##
## Node number 1: 40 observations, complexity param=0.6428571
## predicted class=6 expected loss=0.35 P(node) =1
## class counts: 1 13 26
## probabilities: 0.025 0.325 0.650
## left son=2 (16 obs) right son=3 (24 obs)
## Primary splits:
## Type < 330.5 to the left, improve=10.558330, (0 missing)
## ASL < 22.35043 to the left, improve= 6.100000, (0 missing)
## AWL < 4.616982 to the left, improve= 3.412500, (0 missing)
## NoS < 28.5 to the left, improve= 2.527419, (0 missing)
## Surrogate splits:
## NoS < 29.5 to the left, agree=0.750, adj=0.375, (0 split)
## ASL < 19.47488 to the left, agree=0.725, adj=0.312, (0 split)
## AWL < 4.376638 to the left, agree=0.675, adj=0.188, (0 split)
##
## Node number 2: 16 observations
## predicted class=5 expected loss=0.25 P(node) =0.4
## class counts: 1 12 3
## probabilities: 0.062 0.750 0.188
##
## Node number 3: 24 observations
## predicted class=6 expected loss=0.04166667 P(node) =0.6
## class counts: 0 1 23
## probabilities: 0.000 0.042 0.958
# Raw (unscaled) variable-importance values stored in the fitted tree.
DT.model.1$variable.importance
## Type NoS ASL AWL
## 10.558333 3.959375 3.299479 1.979688
# Install partykit only when it is missing, so re-running the script does not
# download and reinstall the package every time; then load the tree packages.
if (!requireNamespace("partykit", quietly = TRUE)) {
  install.packages("partykit")
}
library(rpart)
library(partykit)
## Warning: パッケージ 'partykit' はバージョン 4.3.2 の R の下で造られました
## 要求されたパッケージ grid をロード中です
## 要求されたパッケージ libcoin をロード中です
## Warning: パッケージ 'libcoin' はバージョン 4.3.2 の R の下で造られました
## 要求されたパッケージ mvtnorm をロード中です
##
## 次のパッケージを付け加えます: 'mvtnorm'
## 以下のオブジェクトは 'package:modelbased' からマスクされています:
##
## standardize
## 以下のオブジェクトは 'package:effectsize' からマスクされています:
##
## standardize
## 以下のオブジェクトは 'package:datawizard' からマスクされています:
##
## standardize
# Convert the rpart tree to a party object and draw it.
plot(as.party(DT.model.1))
# One-way ANOVA: does the learners' Score differ between the three topics?
aov.model <- aov(Score ~ Topic, data = NNS.Index.df2)
summary(aov.model)
## Df Sum Sq Mean Sq F value Pr(>F)
## Topic 2 12.7 6.352 11.49 1.43e-05 ***
## Residuals 376 207.9 0.553
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
library(effects)
# Plot the estimated effect of Topic from the ANOVA model.
plot(allEffects(aov.model))
# Install emmeans only when it is missing, so re-running the script does not
# download and reinstall the package every time.
if (!requireNamespace("emmeans", quietly = TRUE)) {
  install.packages("emmeans")
}
library(emmeans)
# Estimated marginal mean of Score for each Topic, with confidence limits.
aov.means <- emmeans(aov.model, specs = "Topic")
summary(aov.means)
## Topic emmean SE df lower.CL upper.CL
## education 3.75 0.0617 376 3.63 3.87
## money 3.32 0.0847 376 3.16 3.49
## sports 3.41 0.0593 376 3.29 3.52
##
## Confidence level used: 0.95
# Pairwise topic comparisons with Bonferroni-adjusted p-values.
pairs(aov.means, adjust="bonferroni")
## contrast estimate SE df t.ratio p.value
## education - money 0.427 0.1048 376 4.073 0.0002
## education - sports 0.344 0.0856 376 4.018 0.0002
## money - sports -0.083 0.1034 376 -0.802 1.0000
##
## P value adjustment: bonferroni method for 3 tests
# The same one-way model fitted with lm(); "education" is the reference level.
aov.model.lm <- lm(Score ~ Topic, data = NNS.Index.df2)
summary(aov.model.lm)
##
## Call:
## lm(formula = Score ~ Topic, data = NNS.Index.df2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.4076 -0.4076 -0.3247 0.5924 1.6753
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.75172 0.06175 60.761 < 2e-16 ***
## Topicmoney -0.42705 0.10484 -4.073 5.66e-05 ***
## Topicsports -0.34408 0.08564 -4.018 7.09e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7435 on 376 degrees of freedom
## Multiple R-squared: 0.0576, Adjusted R-squared: 0.05258
## F-statistic: 11.49 on 2 and 376 DF, p-value: 1.434e-05
library(effects)
# Plot the estimated effect of Topic from the lm() fit.
plot(allEffects(aov.model.lm))
library(emmeans)
# Estimated marginal mean of Score for each Topic from the lm() fit.
aov.means.lm <- emmeans(aov.model.lm, specs="Topic")
summary(aov.means.lm)
## Topic emmean SE df lower.CL upper.CL
## education 3.75 0.0617 376 3.63 3.87
## money 3.32 0.0847 376 3.16 3.49
## sports 3.41 0.0593 376 3.29 3.52
##
## Confidence level used: 0.95
# Pairwise topic comparisons with Bonferroni-adjusted p-values.
pairs(aov.means.lm, adjust="bonferroni")
## contrast estimate SE df t.ratio p.value
## education - money 0.427 0.1048 376 4.073 0.0002
## education - sports 0.344 0.0856 376 4.018 0.0002
## money - sports -0.083 0.1034 376 -0.802 1.0000
##
## P value adjustment: bonferroni method for 3 tests
library(ggstatsplot)
## You can cite this package as:
## Patil, I. (2021). Visualizations with statistical details: The 'ggstatsplot' approach.
## Journal of Open Source Software, 6(61), 3167, doi:10.21105/joss.03167
# Store Score as numeric for the plot.
# NOTE: as.numeric() on a factor returns the level codes, not the labels.
NNS.Index.df2$Score <- as.numeric(NNS.Index.df2$Score)
# Between-group comparison plot of Score by Topic (ggstatsplot).
ggbetweenstats(NNS.Index.df2,
x=Topic,
y=Score
)
全体では影響する
個別に二組の組み合わせで見ると
* education-money間は差があり
* education-sports間も差があり
* money-sports間は差がない
トピックの違いによる「説明率」は?
* Adjusted R-squared: 0.05258
影響がある:相関関係がある
双方の群に影響がある(どちらとも相関関係がある)ので、その影響を「除いて」、差があるか
共分散:二つの変数のばらつき具合(分散)が相関関係にあることを表す
# Re-check the complete-case learner table before the regression models.
summary(NNS.Index.df2)
## ID Topic Score Token
## JPN501.txt: 1 education:145 Min. :1.000 Min. : 85.0
## JPN502.txt: 1 money : 77 1st Qu.:3.000 1st Qu.:209.0
## JPN503.txt: 1 sports :157 Median :3.000 Median :262.0
## JPN504.txt: 1 Mean :3.522 Mean :275.6
## JPN505.txt: 1 3rd Qu.:4.000 3rd Qu.:322.5
## JPN506.txt: 1 Max. :5.000 Max. :728.0
## (Other) :373
## Type TTR GI NoS
## Min. : 49.0 Min. :0.2531 Min. : 4.566 Min. : 7.00
## 1st Qu.:101.0 1st Qu.:0.4232 1st Qu.: 6.952 1st Qu.:17.00
## Median :122.0 Median :0.4699 Median : 7.503 Median :21.00
## Mean :125.7 Mean :0.4698 Mean : 7.586 Mean :22.08
## 3rd Qu.:146.0 3rd Qu.:0.5137 3rd Qu.: 8.283 3rd Qu.:26.00
## Max. :251.0 Max. :0.6581 Max. :10.443 Max. :51.00
##
## ASL AWL
## Min. : 6.96 Min. :3.507
## 1st Qu.:10.81 1st Qu.:4.163
## Median :12.21 Median :4.395
## Mean :12.71 Mean :4.420
## 3rd Qu.:14.11 3rd Qu.:4.652
## Max. :24.00 Max. :5.415
##
# Make sure Score is numeric before fitting the linear model.
# NOTE: as.numeric() on a factor returns the level codes, not the labels.
NNS.Index.df2$Score <- as.numeric(NNS.Index.df2$Score)
# Multiple regression of Score on the four text indices (no Topic term).
lm.model.1 <- lm(Score ~ Type + NoS + ASL + AWL, data = NNS.Index.df2)
summary(lm.model.1)
##
## Call:
## lm(formula = Score ~ Type + NoS + ASL + AWL, data = NNS.Index.df2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.36867 -0.26243 -0.00216 0.25720 1.22204
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.163820 0.320158 -6.759 5.36e-11 ***
## Type 0.005424 0.001347 4.025 6.89e-05 ***
## NoS 0.063868 0.006150 10.385 < 2e-16 ***
## ASL 0.127821 0.011484 11.130 < 2e-16 ***
## AWL 0.445741 0.061970 7.193 3.49e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3867 on 374 degrees of freedom
## Multiple R-squared: 0.7465, Adjusted R-squared: 0.7437
## F-statistic: 275.3 on 4 and 374 DF, p-value: < 2.2e-16
# Same regression with Topic added, to test for a topic difference once the
# four text indices are in the model.
lm.model.2 <- lm(Score ~ Type + NoS + ASL + AWL + Topic, data = NNS.Index.df2)
summary(lm.model.2)
##
## Call:
## lm(formula = Score ~ Type + NoS + ASL + AWL + Topic, data = NNS.Index.df2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.30627 -0.25182 -0.00646 0.26557 1.25685
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.779747 0.401353 -4.434 1.22e-05 ***
## Type 0.005587 0.001351 4.137 4.36e-05 ***
## NoS 0.062949 0.006172 10.200 < 2e-16 ***
## ASL 0.123192 0.011900 10.352 < 2e-16 ***
## AWL 0.383430 0.074178 5.169 3.85e-07 ***
## Topicmoney -0.090632 0.065852 -1.376 0.170
## Topicsports -0.076367 0.051759 -1.475 0.141
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3864 on 372 degrees of freedom
## Multiple R-squared: 0.7482, Adjusted R-squared: 0.7442
## F-statistic: 184.2 on 6 and 372 DF, p-value: < 2.2e-16
# Effect plots for every term of the model that includes Topic.
plot(allEffects(lm.model.2))
Topicは3つ:education, money, sports
最初の educationを基準に、
いずれも有意ではない
Topicをいれても、他の4つの変数は有意
library(emmeans)
# Estimated marginal mean of Score per Topic from the model with the four indices.
lm.model.2.means <- emmeans(lm.model.2, specs="Topic")
summary(lm.model.2.means)
## Topic emmean SE df lower.CL upper.CL
## education 3.57 0.0368 372 3.50 3.64
## money 3.48 0.0483 372 3.39 3.58
## sports 3.50 0.0322 372 3.43 3.56
##
## Confidence level used: 0.95
# Pairwise topic comparisons with Bonferroni-adjusted p-values.
pairs(lm.model.2.means, adjust="bonferroni")
## contrast estimate SE df t.ratio p.value
## education - money 0.0906 0.0659 372 1.376 0.5087
## education - sports 0.0764 0.0518 372 1.475 0.4228
## money - sports -0.0143 0.0563 372 -0.254 1.0000
##
## P value adjustment: bonferroni method for 3 tests
# Effect of Topic alone, then AWL by Topic drawn as one line per topic.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
plot(Effect("Topic", lm.model.2))
plot(Effect(c("AWL", "Topic"), lm.model.2), multiline = TRUE, confint = list(style = "auto"))
* 各条件の線が平行
* 信頼区間が重なってしまっている
* ゆえに、3条件の間で差があるとは言えない
# The same per-topic effect plot for NoS, ASL and Type.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
plot(Effect(c("NoS", "Topic"), lm.model.2), multiline = TRUE, confint = list(style = "auto"))
plot(Effect(c("ASL", "Topic"), lm.model.2), multiline = TRUE, confint = list(style = "auto"))
plot(Effect(c("Type", "Topic"), lm.model.2), multiline = TRUE, confint = list(style = "auto"))
# Check the column types before fitting the tree.
str(NNS.Index.df2)
## 'data.frame': 379 obs. of 10 variables:
## $ ID : Factor w/ 381 levels "JPN501.txt","JPN502.txt",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Topic: Factor w/ 3 levels "education","money",..: 3 1 1 3 3 2 1 3 3 1 ...
## $ Score: num 4 4 3 4 4 3 4 3 4 3 ...
## $ Token: int 319 351 201 260 417 260 355 195 260 183 ...
## $ Type : int 134 158 121 139 174 123 149 97 103 99 ...
## $ TTR : num 0.42 0.45 0.602 0.535 0.417 ...
## $ GI : num 7.5 8.43 8.53 8.62 8.52 ...
## $ NoS : int 30 29 13 27 25 20 26 20 19 14 ...
## $ ASL : num 10.63 12.1 15.46 9.63 16.68 ...
## $ AWL : num 4.3 4.29 4.75 4.77 4.02 ...
## - attr(*, "na.action")= 'omit' Named int [1:2] 83 159
## ..- attr(*, "names")= chr [1:2] "83" "159"
library(rpart)
# Decision tree for the learners with Score as a numeric response.
DT.model.1.jp <- rpart(Score ~ Type + NoS + ASL + AWL, data = NNS.Index.df2)
summary(DT.model.1.jp)
## Call:
## rpart(formula = Score ~ Type + NoS + ASL + AWL, data = NNS.Index.df2)
## n= 379
##
## CP nsplit rel error xerror xstd
## 1 0.48299902 0 1.0000000 1.0028088 0.07295891
## 2 0.07564856 1 0.5170010 0.5362943 0.04617851
## 3 0.07094946 2 0.4413524 0.4990619 0.04343360
## 4 0.01847598 3 0.3704030 0.4383452 0.03602121
## 5 0.01667539 5 0.3334510 0.4445478 0.03662158
## 6 0.01226593 7 0.3001002 0.4379462 0.03628509
## 7 0.01000000 8 0.2878343 0.4200471 0.03577865
##
## Variable importance
## Type NoS ASL AWL
## 63 24 12 2
##
## Node number 1: 379 observations, complexity param=0.482999
## mean=3.522427, MSE=0.5819508
## left son=2 (222 obs) right son=3 (157 obs)
## Primary splits:
## Type < 129.5 to the left, improve=0.48299900, (0 missing)
## NoS < 25.5 to the left, improve=0.25036220, (0 missing)
## ASL < 13.09454 to the left, improve=0.15390530, (0 missing)
## AWL < 4.254328 to the left, improve=0.03295434, (0 missing)
## Surrogate splits:
## NoS < 22.5 to the left, agree=0.763, adj=0.427, (0 split)
## ASL < 13.21062 to the left, agree=0.654, adj=0.166, (0 split)
## AWL < 5.028386 to the left, agree=0.594, adj=0.019, (0 split)
##
## Node number 2: 222 observations, complexity param=0.07094946
## mean=3.076577, MSE=0.3139558
## left son=4 (43 obs) right son=5 (179 obs)
## Primary splits:
## Type < 89.5 to the left, improve=0.22451900, (0 missing)
## NoS < 14.5 to the left, improve=0.10740230, (0 missing)
## ASL < 8.296703 to the left, improve=0.06486236, (0 missing)
## AWL < 4.217775 to the left, improve=0.03259894, (0 missing)
## Surrogate splits:
## NoS < 11.5 to the left, agree=0.829, adj=0.116, (0 split)
## ASL < 8.068966 to the left, agree=0.811, adj=0.023, (0 split)
## AWL < 5.087011 to the right, agree=0.811, adj=0.023, (0 split)
##
## Node number 3: 157 observations, complexity param=0.07564856
## mean=4.152866, MSE=0.2823644
## left son=6 (131 obs) right son=7 (26 obs)
## Primary splits:
## Type < 175.5 to the left, improve=0.37637140, (0 missing)
## ASL < 13.07952 to the left, improve=0.18487580, (0 missing)
## NoS < 25.5 to the left, improve=0.10668600, (0 missing)
## AWL < 4.731331 to the left, improve=0.06187831, (0 missing)
## Surrogate splits:
## NoS < 39.5 to the left, agree=0.841, adj=0.038, (0 split)
##
## Node number 4: 43 observations, complexity param=0.01226593
## mean=2.534884, MSE=0.3418064
## left son=8 (13 obs) right son=9 (30 obs)
## Primary splits:
## Type < 77.5 to the left, improve=0.1840677, (0 missing)
## ASL < 11.77381 to the left, improve=0.1788112, (0 missing)
## AWL < 4.384465 to the left, improve=0.1403550, (0 missing)
## NoS < 17.5 to the left, improve=0.1230661, (0 missing)
## Surrogate splits:
## ASL < 9.075 to the left, agree=0.791, adj=0.308, (0 split)
## NoS < 8.5 to the left, agree=0.744, adj=0.154, (0 split)
## AWL < 3.98354 to the left, agree=0.744, adj=0.154, (0 split)
##
## Node number 5: 179 observations, complexity param=0.01847598
## mean=3.206704, MSE=0.2198433
## left son=10 (68 obs) right son=11 (111 obs)
## Primary splits:
## Type < 104.5 to the left, improve=0.08758942, (0 missing)
## ASL < 15.1102 to the left, improve=0.08619874, (0 missing)
## AWL < 4.760836 to the left, improve=0.05131998, (0 missing)
## NoS < 25.5 to the left, improve=0.04374676, (0 missing)
## Surrogate splits:
## NoS < 18.5 to the left, agree=0.687, adj=0.176, (0 split)
## ASL < 9.449405 to the left, agree=0.642, adj=0.059, (0 split)
## AWL < 3.686744 to the left, agree=0.631, adj=0.029, (0 split)
##
## Node number 6: 131 observations, complexity param=0.01667539
## mean=4.007634, MSE=0.1907814
## left son=12 (78 obs) right son=13 (53 obs)
## Primary splits:
## ASL < 13.59987 to the left, improve=0.11674050, (0 missing)
## Type < 163.5 to the left, improve=0.10202770, (0 missing)
## AWL < 4.731331 to the left, improve=0.06906154, (0 missing)
## NoS < 22.5 to the left, improve=0.05496745, (0 missing)
## Surrogate splits:
## NoS < 22.5 to the right, agree=0.748, adj=0.377, (0 split)
## AWL < 4.71853 to the left, agree=0.656, adj=0.151, (0 split)
## Type < 168.5 to the left, agree=0.626, adj=0.075, (0 split)
##
## Node number 7: 26 observations
## mean=4.884615, MSE=0.102071
##
## Node number 8: 13 observations
## mean=2.153846, MSE=0.4378698
##
## Node number 9: 30 observations
## mean=2.7, MSE=0.21
##
## Node number 10: 68 observations
## mean=3.029412, MSE=0.1461938
##
## Node number 11: 111 observations, complexity param=0.01847598
## mean=3.315315, MSE=0.2339096
## left son=22 (97 obs) right son=23 (14 obs)
## Primary splits:
## ASL < 15.5522 to the left, improve=0.18114670, (0 missing)
## AWL < 4.487291 to the left, improve=0.07590572, (0 missing)
## NoS < 25.5 to the left, improve=0.03543986, (0 missing)
## Type < 120.5 to the left, improve=0.02931992, (0 missing)
## Surrogate splits:
## NoS < 15.5 to the right, agree=0.910, adj=0.286, (0 split)
## AWL < 5.162663 to the left, agree=0.892, adj=0.143, (0 split)
##
## Node number 12: 78 observations
## mean=3.884615, MSE=0.1533531
##
## Node number 13: 53 observations, complexity param=0.01667539
## mean=4.188679, MSE=0.1908152
## left son=26 (45 obs) right son=27 (8 obs)
## Primary splits:
## NoS < 26.5 to the left, improve=0.43885260, (0 missing)
## Type < 163 to the left, improve=0.17471090, (0 missing)
## AWL < 4.661196 to the left, improve=0.09461497, (0 missing)
## ASL < 15.1835 to the left, improve=0.01363022, (0 missing)
##
## Node number 22: 97 observations
## mean=3.237113, MSE=0.2015092
##
## Node number 23: 14 observations
## mean=3.857143, MSE=0.122449
##
## Node number 26: 45 observations
## mean=4.066667, MSE=0.1066667
##
## Node number 27: 8 observations
## mean=4.875, MSE=0.109375
library(rpart)
library(partykit)
# Convert the rpart tree to a party object and draw it.
plot(as.party(DT.model.1.jp))
# Check the column types again before Score is converted to a factor.
str(NNS.Index.df2)
## 'data.frame': 379 obs. of 10 variables:
## $ ID : Factor w/ 381 levels "JPN501.txt","JPN502.txt",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Topic: Factor w/ 3 levels "education","money",..: 3 1 1 3 3 2 1 3 3 1 ...
## $ Score: num 4 4 3 4 4 3 4 3 4 3 ...
## $ Token: int 319 351 201 260 417 260 355 195 260 183 ...
## $ Type : int 134 158 121 139 174 123 149 97 103 99 ...
## $ TTR : num 0.42 0.45 0.602 0.535 0.417 ...
## $ GI : num 7.5 8.43 8.53 8.62 8.52 ...
## $ NoS : int 30 29 13 27 25 20 26 20 19 14 ...
## $ ASL : num 10.63 12.1 15.46 9.63 16.68 ...
## $ AWL : num 4.3 4.29 4.75 4.77 4.02 ...
## - attr(*, "na.action")= 'omit' Named int [1:2] 83 159
## ..- attr(*, "names")= chr [1:2] "83" "159"
library(rpart)
# Convert Score to a factor so the tree predicts a score class.
NNS.Index.df2$Score <- as.factor(NNS.Index.df2$Score)
# Refit the tree; this overwrites the DT.model.1.jp object fitted earlier.
DT.model.1.jp <- rpart(Score ~ Type + NoS + ASL + AWL, data = NNS.Index.df2)
summary(DT.model.1.jp)
## Call:
## rpart(formula = Score ~ Type + NoS + ASL + AWL, data = NNS.Index.df2)
## n= 379
##
## CP nsplit rel error xerror xstd
## 1 0.46190476 0 1.0000000 1.0000000 0.04608017
## 2 0.09523810 1 0.5380952 0.5523810 0.04272361
## 3 0.02380952 2 0.4428571 0.4619048 0.04045494
## 4 0.01269841 4 0.3952381 0.4380952 0.03974623
## 5 0.01000000 7 0.3571429 0.4285714 0.03944857
##
## Variable importance
## Type NoS ASL AWL
## 60 24 14 2
##
## Node number 1: 379 observations, complexity param=0.4619048
## predicted class=3 expected loss=0.5540897 P(node) =1
## class counts: 2 21 169 151 36
## probabilities: 0.005 0.055 0.446 0.398 0.095
## left son=2 (222 obs) right son=3 (157 obs)
## Primary splits:
## Type < 129.5 to the left, improve=65.715660, (0 missing)
## NoS < 25.5 to the left, improve=25.019590, (0 missing)
## ASL < 13.46694 to the left, improve=12.224380, (0 missing)
## AWL < 5.028386 to the left, improve= 2.839657, (0 missing)
## Surrogate splits:
## NoS < 22.5 to the left, agree=0.763, adj=0.427, (0 split)
## ASL < 13.21062 to the left, agree=0.654, adj=0.166, (0 split)
## AWL < 5.028386 to the left, agree=0.594, adj=0.019, (0 split)
##
## Node number 2: 222 observations, complexity param=0.02380952
## predicted class=3 expected loss=0.2927928 P(node) =0.585752
## class counts: 2 21 157 42 0
## probabilities: 0.009 0.095 0.707 0.189 0.000
## left son=4 (102 obs) right son=5 (120 obs)
## Primary splits:
## Type < 102.5 to the left, improve=6.958214, (0 missing)
## ASL < 15.88095 to the left, improve=5.668513, (0 missing)
## NoS < 14.5 to the left, improve=3.565449, (0 missing)
## AWL < 4.760836 to the left, improve=1.807207, (0 missing)
## Surrogate splits:
## NoS < 18.5 to the left, agree=0.752, adj=0.461, (0 split)
## ASL < 9.449405 to the left, agree=0.586, adj=0.098, (0 split)
## AWL < 4.608173 to the right, agree=0.577, adj=0.078, (0 split)
##
## Node number 3: 157 observations, complexity param=0.0952381
## predicted class=4 expected loss=0.3057325 P(node) =0.414248
## class counts: 0 0 12 109 36
## probabilities: 0.000 0.000 0.076 0.694 0.229
## left son=6 (133 obs) right son=7 (24 obs)
## Primary splits:
## Type < 177.5 to the left, improve=24.125300, (0 missing)
## ASL < 13.07952 to the left, improve= 7.021848, (0 missing)
## NoS < 32.5 to the left, improve= 3.607608, (0 missing)
## AWL < 4.731331 to the left, improve= 3.210214, (0 missing)
## Surrogate splits:
## NoS < 39.5 to the left, agree=0.854, adj=0.042, (0 split)
##
## Node number 4: 102 observations, complexity param=0.01269841
## predicted class=3 expected loss=0.2254902 P(node) =0.2691293
## class counts: 2 18 79 3 0
## probabilities: 0.020 0.176 0.775 0.029 0.000
## left son=8 (43 obs) right son=9 (59 obs)
## Primary splits:
## Type < 89.5 to the left, improve=5.7352670, (0 missing)
## NoS < 14.5 to the left, improve=2.9769090, (0 missing)
## ASL < 11.80625 to the left, improve=1.7113420, (0 missing)
## AWL < 4.384465 to the left, improve=0.8150671, (0 missing)
## Surrogate splits:
## NoS < 14.5 to the left, agree=0.696, adj=0.279, (0 split)
## ASL < 9.371711 to the left, agree=0.618, adj=0.093, (0 split)
## AWL < 4.943758 to the right, agree=0.608, adj=0.070, (0 split)
##
## Node number 5: 120 observations, complexity param=0.02380952
## predicted class=3 expected loss=0.35 P(node) =0.3166227
## class counts: 0 3 78 39 0
## probabilities: 0.000 0.025 0.650 0.325 0.000
## left son=10 (106 obs) right son=11 (14 obs)
## Primary splits:
## ASL < 15.5522 to the left, improve=8.574259, (0 missing)
## AWL < 4.75466 to the left, improve=3.330000, (0 missing)
## Type < 103.5 to the left, improve=1.571492, (0 missing)
## NoS < 25.5 to the left, improve=1.477470, (0 missing)
## Surrogate splits:
## NoS < 15.5 to the right, agree=0.908, adj=0.214, (0 split)
## AWL < 5.162663 to the left, agree=0.900, adj=0.143, (0 split)
##
## Node number 6: 133 observations
## predicted class=4 expected loss=0.1954887 P(node) =0.3509235
## class counts: 0 0 12 107 14
## probabilities: 0.000 0.000 0.090 0.805 0.105
##
## Node number 7: 24 observations
## predicted class=5 expected loss=0.08333333 P(node) =0.06332454
## class counts: 0 0 0 2 22
## probabilities: 0.000 0.000 0.000 0.083 0.917
##
## Node number 8: 43 observations, complexity param=0.01269841
## predicted class=3 expected loss=0.4186047 P(node) =0.1134565
## class counts: 2 16 25 0 0
## probabilities: 0.047 0.372 0.581 0.000 0.000
## left son=16 (23 obs) right son=17 (20 obs)
## Primary splits:
## ASL < 11.77381 to the left, improve=2.975126, (0 missing)
## NoS < 17.5 to the left, improve=2.640827, (0 missing)
## Type < 77.5 to the left, improve=2.126297, (0 missing)
## AWL < 4.384465 to the left, improve=1.949039, (0 missing)
## Surrogate splits:
## NoS < 11.5 to the right, agree=0.698, adj=0.35, (0 split)
## Type < 71 to the left, agree=0.581, adj=0.10, (0 split)
## AWL < 4.490176 to the left, agree=0.581, adj=0.10, (0 split)
##
## Node number 9: 59 observations
## predicted class=3 expected loss=0.08474576 P(node) =0.1556728
## class counts: 0 2 54 3 0
## probabilities: 0.000 0.034 0.915 0.051 0.000
##
## Node number 10: 106 observations
## predicted class=3 expected loss=0.2830189 P(node) =0.2796834
## class counts: 0 3 76 27 0
## probabilities: 0.000 0.028 0.717 0.255 0.000
##
## Node number 11: 14 observations
## predicted class=4 expected loss=0.1428571 P(node) =0.03693931
## class counts: 0 0 2 12 0
## probabilities: 0.000 0.000 0.143 0.857 0.000
##
## Node number 16: 23 observations, complexity param=0.01269841
## predicted class=2 expected loss=0.4782609 P(node) =0.06068602
## class counts: 2 12 9 0 0
## probabilities: 0.087 0.522 0.391 0.000 0.000
## left son=32 (14 obs) right son=33 (9 obs)
## Primary splits:
## NoS < 16.5 to the left, improve=3.646653, (0 missing)
## AWL < 4.384465 to the left, improve=2.982872, (0 missing)
## ASL < 9.371711 to the right, improve=1.998024, (0 missing)
## Type < 78 to the left, improve=1.741891, (0 missing)
## Surrogate splits:
## Type < 82.5 to the left, agree=0.696, adj=0.222, (0 split)
## ASL < 8.826087 to the right, agree=0.652, adj=0.111, (0 split)
##
## Node number 17: 20 observations
## predicted class=3 expected loss=0.2 P(node) =0.05277045
## class counts: 0 4 16 0 0
## probabilities: 0.000 0.200 0.800 0.000 0.000
##
## Node number 32: 14 observations
## predicted class=2 expected loss=0.2857143 P(node) =0.03693931
## class counts: 2 10 2 0 0
## probabilities: 0.143 0.714 0.143 0.000 0.000
##
## Node number 33: 9 observations
## predicted class=3 expected loss=0.2222222 P(node) =0.0237467
## class counts: 0 2 7 0 0
## probabilities: 0.000 0.222 0.778 0.000 0.000
library(rpart)
library(partykit)
# Convert the refitted tree to a party object and draw it.
plot(as.party(DT.model.1.jp))