{{outline}}
!!!TypeとToken
!!Rのパッケージ corpus を参考に、TypeとTokenの振る舞いを見てみる
http://corpustext.com/articles/corpus.html
!オズの魔法使いのテキストを取ってきて本文だけにする。
 oz.text <- gsub("\\n", " ", text)
 oz.text.nopunct <- gsub("\\W+", " ", oz.text)
 oz.words <- strsplit(oz.text.nopunct, "\\W")
 oz.words <- unlist(oz.words)
 write(oz.words, file="ozWords.txt")
 length(oz.words)
*39,456語
 > head(sort(table(oz.words), decreasing=T), 10)
 oz.words
  the  and   to   of    a    I  was   in  you   he 
 2731 1593 1096  811  795  647  501  463  448  410 

!TypeとTokenの分布を見てみる
*394行2列の行列、0で初期化
 > oztt <- matrix(0, nrow=394, ncol=2)
*100語ずつ累積して39,400語までのTypeとTokenを見てみる。
	i <- 1
	y <- 0
	while (i <= 394) {
		y <- i * 100
		tmp <- oz.words[1:y]
		oztt[i,1] <- length(tmp)
		oztt[i,2] <- length(unique(tmp))
		i <- i+1
	}
*データフレーム化して、見出しをつける
 > oztt <- as.data.frame(oztt)
 > colnames(oztt) <- c("token","type")
 > head(oztt)
   token type
 1   100   63
 2   200  122
 3   300  167
 4   400  209
 5   500  248
 6   600  280

 >plot(oztt$token, oztt$type)
{{ref_image tt.png}}

!!多様性を見てみる
!TTRを追加する
 > oztt$ttr <- oztt$type / oztt$token
 > head(oztt)
   token type       ttr
 1   100   63 0.6300000
 2   200  122 0.6100000
 3   300  167 0.5566667
 4   400  209 0.5225000
 5   500  248 0.4960000
 6   600  280 0.4666667
*tokenの増加に伴うtypeの増加、および、TTRの減少をグラフにプロット
<<y軸のスケールは無視してグラフが重なるようにプロットしている>>
 > plot(oztt$token, oztt$type)
 > par(new=T)
 > plot(oztt$token, oztt$ttr, col = "red")
{{ref_image TTR.png}}

!Guiraud Index（ギロー・インデックス）を追加してみる
 > oztt$gi <- oztt$type / sqrt(oztt$token)
 > par(new=T)
 > plot(oztt$token, oztt$gi, col = "blue")
{{ref_image GI.png}}
*およそ4000語を越えたあたりからほぼ水平で意外と安定している。

!!1000語までで見てみる

 ozttt <- matrix(0, nrow=1000, ncol=2)
	i <- 1
	while (i <= 1000) {
		tmp <- oz.words[1:i]
		ozttt[i,1] <- length(tmp)
		ozttt[i,2] <- length(unique(tmp))
		i <- i+1
	}
 
 ozttt <- as.data.frame(ozttt)
 colnames(ozttt) <- c("token","type")

 cor.test(ozttt$type, ozttt$token)
 
        Pearson's product-moment correlation
 
 data:  ozttt$type and ozttt$token
 t = 234.34, df = 998, p-value < 2.2e-16
 alternative hypothesis: true correlation is not equal to 0
 95 percent confidence interval:
  0.9898566 0.9920780
 sample estimates:
       cor 
 0.9910356 

 lm(ozttt$type ~ ozttt$token)
 Call:
 lm(formula = ozttt$type ~ ozttt$token)
 
 Coefficients:
 (Intercept)  ozttt$token  
     43.8034       0.3761 

 #plot(ozttt$token, ozttt$type)
 #abline(lm(ozttt$type ~ ozttt$token))
 
 pred1000 <- predict(lm(ozttt$type ~ ozttt$token), interval = "prediction")
 
     結果を保存したpred1000のデータををデータフレーム型に変更  
 pred1000 <- as.data.frame(pred1000)
 
     データをプロット  
 plot(ozttt$token, ozttt$type)
 
     フィット（回帰直線）を黒で描く  
 lines(ozttt$token, pred1000$fit, col = "black")
 
     上限値を赤で描く  
 lines(ozttt$token, pred1000$upr, col = "red")
 
     下限値を青で描く  
 lines(ozttt$token, pred1000$lwr, col = "blue")
{{ref_image tt1000.png}}
{{ref_image oztttgi.png}}

!!2千語までで見てみる
{{ref_image oztt2tgi.png}}


!!1万語までで見てみる
{{ref_image oztt10t.png}}

 oztt10t <- matrix(0, nrow=10000, ncol=2)
	i <- 1
	while (i <= 10000) {
		tmp <- oz.words[1:i]
		oztt10t[i,1] <- length(tmp)
		oztt10t[i,2] <- length(unique(tmp))
		i <- i+1
	}
 
 oztt10t <- as.data.frame(oztt10t)
 colnames(oztt10t) <- c("token","type")
 
 plot(oztt10t$token, oztt10t$type)
 
 oztt10t$ttr <- oztt10t$type / oztt10t$token
 
 par(new=T)
 plot(oztt10t$token, oztt10t$ttr, col = "red")
 
 oztt10t$gi <- oztt10t$type / sqrt(oztt10t$token)
 par(new=T)
 plot(oztt10t$token, oztt10t$gi, col = "blue")