# Read the index table from all_indexes.df.txt (resolved against the current
# working directory) and show its first rows.
all_indexes.df <- read.table(file = "all_indexes.df.txt")
head(x = all_indexes.df)
ID <chr> | Topic <chr> | Score <int> | Type <int> | Token <int> | TTR <dbl> | GI <dbl> | MATTR <dbl> | AWL <dbl> | ||
---|---|---|---|---|---|---|---|---|---|---|
1 | JPN501 | sports | 4 | 134 | 638 | 0.4200627 | 7.502560 | 0.4258621 | 4.304075 | |
2 | JPN502 | education | 4 | 0 | 0 | NA | NA | NA | NA | |
3 | JPN503 | education | 3 | 0 | 0 | NA | NA | NA | NA | |
4 | JPN504 | sports | 4 | 160 | 712 | 0.4494382 | 8.479983 | 0.4549438 | 4.233146 | |
5 | JPN505 | sports | 4 | 0 | 0 | NA | NA | NA | NA | |
6 | JPN506 | money | 3 | 0 | 0 | NA | NA | NA | NA |
# Inspect the column types and dimensions of the full table.
str(object = all_indexes.df)
## 'data.frame': 1214 obs. of 12 variables:
## $ ID : chr "JPN501" "JPN502" "JPN503" "JPN504" ...
## $ Topic: chr "sports" "education" "education" "sports" ...
## $ Score: int 4 4 3 4 4 3 4 3 4 3 ...
## $ Type : int 134 0 0 160 0 0 121 0 0 139 ...
## $ Token: int 638 0 0 712 0 0 402 0 0 520 ...
## $ TTR : num 0.42 NA NA 0.449 NA ...
## $ GI : num 7.5 NA NA 8.48 NA ...
## $ MATTR: num 0.426 NA NA 0.455 NA ...
## $ AWL : num 4.3 NA NA 4.23 NA ...
## $ ASL : num 10.6 NA NA 12.3 NA ...
## $ NoS : int 30 0 0 29 0 0 13 0 0 27 ...
## $ Lang : int 2 2 2 2 2 2 2 2 2 2 ...
# Keep only the rows whose Lang equals 2, and only the Score column plus the
# eight index columns. which() drops rows where the comparison is NA, exactly
# as subset() does; drop = FALSE keeps the result a data frame.
DT_jp_indexes.df <- all_indexes.df[
  which(all_indexes.df$Lang == 2),
  c("Score", "Type", "Token", "TTR", "GI", "MATTR", "AWL", "ASL", "NoS"),
  drop = FALSE
]
head(x = DT_jp_indexes.df)
Score <int> | Type <int> | Token <int> | TTR <dbl> | GI <dbl> | MATTR <dbl> | AWL <dbl> | ASL <dbl> | NoS <int> | |
---|---|---|---|---|---|---|---|---|---|
1 | 4 | 134 | 638 | 0.4200627 | 7.502560 | 0.4258621 | 4.304075 | 10.63333 | 30 |
2 | 4 | 0 | 0 | NA | NA | NA | NA | NA | 0 |
3 | 3 | 0 | 0 | NA | NA | NA | NA | NA | 0 |
4 | 4 | 160 | 712 | 0.4494382 | 8.479983 | 0.4549438 | 4.233146 | 12.27586 | 29 |
5 | 4 | 0 | 0 | NA | NA | NA | NA | NA | 0 |
6 | 3 | 0 | 0 | NA | NA | NA | NA | NA | 0 |
# Inspect the structure of the Lang == 2 subset.
str(object = DT_jp_indexes.df)
## 'data.frame': 1143 obs. of 9 variables:
## $ Score: int 4 4 3 4 4 3 4 3 4 3 ...
## $ Type : int 134 0 0 160 0 0 121 0 0 139 ...
## $ Token: int 638 0 0 712 0 0 402 0 0 520 ...
## $ TTR : num 0.42 NA NA 0.449 NA ...
## $ GI : num 7.5 NA NA 8.48 NA ...
## $ MATTR: num 0.426 NA NA 0.455 NA ...
## $ AWL : num 4.3 NA NA 4.23 NA ...
## $ ASL : num 10.6 NA NA 12.3 NA ...
## $ NoS : int 30 0 0 29 0 0 13 0 0 27 ...
# Check whether any column of the subset still contains a missing value.
anyNA(DT_jp_indexes.df, recursive = FALSE)
## [1] TRUE
# Drop every row that has at least one NA. na.omit() records the removed row
# indices in the "na.action" attribute of the result.
DT_jp_indexes.df2 <- stats::na.omit(object = DT_jp_indexes.df)
str(object = DT_jp_indexes.df2)
## 'data.frame': 381 obs. of 9 variables:
## $ Score: int 4 4 4 3 4 4 4 3 3 4 ...
## $ Type : int 134 160 121 139 175 124 151 98 104 99 ...
## $ Token: int 638 712 402 520 840 522 724 396 526 366 ...
## $ TTR : num 0.42 0.449 0.602 0.535 0.417 ...
## $ GI : num 7.5 8.48 8.53 8.62 8.54 ...
## $ MATTR: num 0.426 0.455 0.606 0.539 0.422 ...
## $ AWL : num 4.3 4.23 4.75 4.77 4 ...
## $ ASL : num 10.63 12.28 15.46 9.63 16.8 ...
## $ NoS : int 30 29 13 27 25 20 26 20 19 14 ...
## - attr(*, "na.action")= 'omit' Named int [1:762] 2 3 5 6 8 9 11 12 14 15 ...
## ..- attr(*, "names")= chr [1:762] "2" "3" "5" "6" ...
# Load rpart for fitting decision trees. Install it only when it is missing,
# so re-running the script does not re-download the package every time, and
# fetch it over HTTPS rather than plain HTTP.
if (!requireNamespace("rpart", quietly = TRUE)) {
  install.packages("rpart", repos = "https://cran.rstudio.com/")
}
library(rpart)
# Re-check the structure of the NA-free data before modelling.
str(object = DT_jp_indexes.df2)
## 'data.frame': 381 obs. of 9 variables:
## $ Score: int 4 4 4 3 4 4 4 3 3 4 ...
## $ Type : int 134 160 121 139 175 124 151 98 104 99 ...
## $ Token: int 638 712 402 520 840 522 724 396 526 366 ...
## $ TTR : num 0.42 0.449 0.602 0.535 0.417 ...
## $ GI : num 7.5 8.48 8.53 8.62 8.54 ...
## $ MATTR: num 0.426 0.455 0.606 0.539 0.422 ...
## $ AWL : num 4.3 4.23 4.75 4.77 4 ...
## $ ASL : num 10.63 12.28 15.46 9.63 16.8 ...
## $ NoS : int 30 29 13 27 25 20 26 20 19 14 ...
## - attr(*, "na.action")= 'omit' Named int [1:762] 2 3 5 6 8 9 11 12 14 15 ...
## ..- attr(*, "names")= chr [1:762] "2" "3" "5" "6" ...
# Turn the integer Score column into a factor so that it is modelled as a
# categorical outcome, then confirm the new column type.
DT_jp_indexes.df2[["Score"]] <- as.factor(DT_jp_indexes.df2[["Score"]])
str(object = DT_jp_indexes.df2)
## 'data.frame': 381 obs. of 9 variables:
## $ Score: Factor w/ 5 levels "1","2","3","4",..: 4 4 4 3 4 4 4 3 3 4 ...
## $ Type : int 134 160 121 139 175 124 151 98 104 99 ...
## $ Token: int 638 712 402 520 840 522 724 396 526 366 ...
## $ TTR : num 0.42 0.449 0.602 0.535 0.417 ...
## $ GI : num 7.5 8.48 8.53 8.62 8.54 ...
## $ MATTR: num 0.426 0.455 0.606 0.539 0.422 ...
## $ AWL : num 4.3 4.23 4.75 4.77 4 ...
## $ ASL : num 10.63 12.28 15.46 9.63 16.8 ...
## $ NoS : int 30 29 13 27 25 20 26 20 19 14 ...
## - attr(*, "na.action")= 'omit' Named int [1:762] 2 3 5 6 8 9 11 12 14 15 ...
## ..- attr(*, "names")= chr [1:762] "2" "3" "5" "6" ...
# Fit a tree that predicts Score from every other column of the data frame
# (the `.` on the right-hand side), then print the resulting splits.
DT_result <- rpart(formula = Score ~ ., data = DT_jp_indexes.df2)
DT_result
## n= 381
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 381 207 3 (0.0079 0.039 0.46 0.46 0.039)
## 2) Token< 594 248 118 3 (0.0081 0.052 0.52 0.38 0.04)
## 4) AWL< 4.632782 189 83 3 (0.011 0.048 0.56 0.33 0.053)
## 8) Type>=69.5 182 77 3 (0.011 0.049 0.58 0.32 0.044) *
## 9) Type< 69.5 7 3 4 (0 0 0.14 0.57 0.29) *
## 5) AWL>=4.632782 59 28 4 (0 0.068 0.41 0.53 0)
## 10) TTR< 0.5087177 27 12 3 (0 0.037 0.56 0.41 0)
## 20) NoS>=15.5 17 5 3 (0 0.059 0.71 0.24 0) *
## 21) NoS< 15.5 10 3 4 (0 0 0.3 0.7 0) *
## 11) TTR>=0.5087177 32 12 4 (0 0.094 0.28 0.62 0) *
## 3) Token>=594 133 52 4 (0.0075 0.015 0.33 0.61 0.038) *
# Print the detailed report of the fitted tree (CP table, variable importance
# and per-node primary/surrogate splits).
summary(object = DT_result)
## Call:
## rpart(formula = Score ~ ., data = DT_jp_indexes.df2)
## n= 381
##
## CP nsplit rel error xerror xstd
## 1 0.17874396 0 1.0000000 1.1304348 0.04590223
## 2 0.03381643 1 0.8212560 0.9227053 0.04714772
## 3 0.01932367 2 0.7874396 0.9033816 0.04713992
## 4 0.01449275 4 0.7487923 0.9420290 0.04713472
## 5 0.01000000 5 0.7342995 0.9661836 0.04708922
##
## Variable importance
## Token Type NoS TTR GI MATTR AWL ASL
## 23 18 14 11 11 10 10 4
##
## Node number 1: 381 observations, complexity param=0.178744
## predicted class=3 expected loss=0.5433071 P(node) =1
## class counts: 3 15 174 174 15
## probabilities: 0.008 0.039 0.457 0.457 0.039
## left son=2 (248 obs) right son=3 (133 obs)
## Primary splits:
## Token < 594 to the left, improve=8.099891, (0 missing)
## Type < 147.5 to the left, improve=7.117459, (0 missing)
## GI < 7.410459 to the left, improve=4.771494, (0 missing)
## AWL < 5.09147 to the left, improve=4.142281, (0 missing)
## NoS < 25.5 to the left, improve=3.874786, (0 missing)
## Surrogate splits:
## Type < 135.5 to the left, agree=0.871, adj=0.632, (0 split)
## NoS < 25.5 to the left, agree=0.808, adj=0.451, (0 split)
## TTR < 0.4195045 to the right, agree=0.745, adj=0.271, (0 split)
## MATTR < 0.4253095 to the right, agree=0.745, adj=0.271, (0 split)
## GI < 8.274998 to the left, agree=0.740, adj=0.256, (0 split)
##
## Node number 2: 248 observations, complexity param=0.03381643
## predicted class=3 expected loss=0.4758065 P(node) =0.6509186
## class counts: 2 13 130 93 10
## probabilities: 0.008 0.052 0.524 0.375 0.040
## left son=4 (189 obs) right son=5 (59 obs)
## Primary splits:
## AWL < 4.632782 to the left, improve=2.968262, (0 missing)
## Token < 430 to the right, improve=2.892357, (0 missing)
## GI < 7.34318 to the left, improve=2.243583, (0 missing)
## ASL < 9.693333 to the right, improve=2.108409, (0 missing)
## TTR < 0.4675666 to the left, improve=1.971859, (0 missing)
## Surrogate splits:
## TTR < 0.6114551 to the left, agree=0.778, adj=0.068, (0 split)
## MATTR < 0.6153406 to the left, agree=0.778, adj=0.068, (0 split)
## ASL < 15.42308 to the left, agree=0.770, adj=0.034, (0 split)
##
## Node number 3: 133 observations
## predicted class=4 expected loss=0.3909774 P(node) =0.3490814
## class counts: 1 2 44 81 5
## probabilities: 0.008 0.015 0.331 0.609 0.038
##
## Node number 4: 189 observations, complexity param=0.01449275
## predicted class=3 expected loss=0.4391534 P(node) =0.496063
## class counts: 2 9 106 62 10
## probabilities: 0.011 0.048 0.561 0.328 0.053
## left son=8 (182 obs) right son=9 (7 obs)
## Primary splits:
## Type < 69.5 to the right, improve=2.111925, (0 missing)
## ASL < 9.674815 to the right, improve=1.816707, (0 missing)
## GI < 5.906072 to the right, improve=1.743529, (0 missing)
## Token < 430 to the right, improve=1.293308, (0 missing)
## AWL < 4.528419 to the right, improve=1.279620, (0 missing)
## Surrogate splits:
## GI < 5.872855 to the right, agree=0.989, adj=0.714, (0 split)
## Token < 243 to the right, agree=0.979, adj=0.429, (0 split)
## TTR < 0.3486417 to the right, agree=0.968, adj=0.143, (0 split)
## MATTR < 0.3551553 to the right, agree=0.968, adj=0.143, (0 split)
##
## Node number 5: 59 observations, complexity param=0.01932367
## predicted class=4 expected loss=0.4745763 P(node) =0.1548556
## class counts: 0 4 24 31 0
## probabilities: 0.000 0.068 0.407 0.525 0.000
## left son=10 (27 obs) right son=11 (32 obs)
## Primary splits:
## TTR < 0.5087177 to the left, improve=1.842318, (0 missing)
## MATTR < 0.5136305 to the left, improve=1.842318, (0 missing)
## Type < 86.5 to the left, improve=1.713522, (0 missing)
## GI < 7.237091 to the left, improve=1.440639, (0 missing)
## AWL < 4.707055 to the right, improve=1.347663, (0 missing)
## Surrogate splits:
## MATTR < 0.5136305 to the left, agree=1.000, adj=1.000, (0 split)
## GI < 7.237091 to the left, agree=0.814, adj=0.593, (0 split)
## AWL < 4.864062 to the right, agree=0.661, adj=0.259, (0 split)
## Token < 384 to the right, agree=0.644, adj=0.222, (0 split)
## Type < 112.5 to the left, agree=0.627, adj=0.185, (0 split)
##
## Node number 8: 182 observations
## predicted class=3 expected loss=0.4230769 P(node) =0.4776903
## class counts: 2 9 105 58 8
## probabilities: 0.011 0.049 0.577 0.319 0.044
##
## Node number 9: 7 observations
## predicted class=4 expected loss=0.4285714 P(node) =0.0183727
## class counts: 0 0 1 4 2
## probabilities: 0.000 0.000 0.143 0.571 0.286
##
## Node number 10: 27 observations, complexity param=0.01932367
## predicted class=3 expected loss=0.4444444 P(node) =0.07086614
## class counts: 0 1 15 11 0
## probabilities: 0.000 0.037 0.556 0.407 0.000
## left son=20 (17 obs) right son=21 (10 obs)
## Primary splits:
## NoS < 15.5 to the right, improve=2.4187360, (0 missing)
## ASL < 13.91071 to the left, improve=1.5925930, (0 missing)
## GI < 7.06374 to the right, improve=0.9095118, (0 missing)
## AWL < 4.869654 to the right, improve=0.8481481, (0 missing)
## Token < 497 to the right, improve=0.7270955, (0 missing)
## Surrogate splits:
## ASL < 13.91071 to the left, agree=0.889, adj=0.7, (0 split)
## AWL < 4.989478 to the left, agree=0.778, adj=0.4, (0 split)
## Token < 399 to the right, agree=0.741, adj=0.3, (0 split)
## Type < 81.5 to the right, agree=0.704, adj=0.2, (0 split)
## TTR < 0.4873827 to the left, agree=0.667, adj=0.1, (0 split)
##
## Node number 11: 32 observations
## predicted class=4 expected loss=0.375 P(node) =0.0839895
## class counts: 0 3 9 20 0
## probabilities: 0.000 0.094 0.281 0.625 0.000
##
## Node number 20: 17 observations
## predicted class=3 expected loss=0.2941176 P(node) =0.04461942
## class counts: 0 1 12 4 0
## probabilities: 0.000 0.059 0.706 0.235 0.000
##
## Node number 21: 10 observations
## predicted class=4 expected loss=0.3 P(node) =0.02624672
## class counts: 0 0 3 7 0
## probabilities: 0.000 0.000 0.300 0.700 0.000
# Draw the fitted tree with rpart.plot. The package is installed only when it
# is missing, so library() does not fail on a fresh machine and repeated runs
# do not re-download it.
if (!requireNamespace("rpart.plot", quietly = TRUE)) {
  install.packages("rpart.plot", repos = "https://cran.rstudio.com/")
}
library(rpart.plot)
rpart.plot(DT_result)

# Load partykit (used for the as.party() conversion) with the same guarded,
# HTTPS-based install.
if (!requireNamespace("partykit", quietly = TRUE)) {
  install.packages("partykit", repos = "https://cran.rstudio.com/")
}
library(partykit)
## 要求されたパッケージ grid をロード中です
## 要求されたパッケージ libcoin をロード中です
## 要求されたパッケージ mvtnorm をロード中です
# Convert the rpart fit to a party object and plot it.
plot(x = as.party(obj = DT_result))
頻度に差があるか: chisq.test()
可視化: mosaicplot()
どこに差があるか:残差分析
群馬大学の青木先生のサイト http://aoki2.si.gunma-u.ac.jp/R/ 度数に関する検定 カイ二乗分布を使用する独立性の検定と残差分析
> source("http://aoki2.si.gunma-u.ac.jp/R/src/my-chisq-test.R", encoding="euc-jp")
★my-chisq-test.Rというファイル名だが、関数名はmy.chisq.test()
> source("http://aoki2.si.gunma-u.ac.jp/R/src/G2.R", encoding="euc-jp")
サンプルサイズに関係ない効果量
2×2の場合
オッズ比:Fisherの直接確率検定(正確確率検定): fisher.test()
解釈の仕方:「1」が基準=確率に差はない。1より大きければ、分子の方の確率が高い。
> fisher.test(therefore.data)
Fisher's Exact Test for Count Data
data: therefore.data
p-value = 9.958e-06
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
0.1021974 0.4526589
sample estimates:
odds ratio
0.2196899
> install.packages("lsr", dependencies = TRUE)
> library(lsr)
> cramersV(therefore.data)
[1] 0.3081308
> tmp.data <- c(38, 15, 53, 96)
> cramersV(tmp.data)
[1] 0.3378448
> tmp.data2 <- c(38, 15, 53, 956)
> cramersV(tmp.data2)
[1] 0.8674172