{{outline}}
!!!TypeとToken
!!Rのパッケージ corpus を参考に、TypeとTokenの振る舞いを見てみる
http://corpustext.com/articles/corpus.html
!オズの魔法使いのテキストを取ってきて本文だけにする（以下、本文は変数 text に入っているものとする）。
 # Build the vector of word tokens for the Oz text.
 # NOTE(review): `text` is not defined in this snippet; it is assumed to already
 # hold the body text as a character vector - confirm against the fetch step.
 # Turn newline characters into spaces.
 oz.text <- gsub("\\n", " ", text)
 # Collapse every run of non-word characters (punctuation, whitespace) into a
 # single space.
 oz.text.nopunct <- gsub("\\W+", " ", oz.text)
 # Split on the remaining separators and flatten the list strsplit() returns.
 oz.words <- unlist(strsplit(oz.text.nopunct, "\\W"))
 # strsplit() yields "" when an element starts with the separator; drop those
 # so that empty strings are not counted as tokens (or as a type).
 oz.words <- oz.words[nzchar(oz.words)]
 # Save the tokens (write() puts character data one item per line), then
 # report the token count.
 write(oz.words, file="ozWords.txt")
 length(oz.words)
*39,456語
 > # Ten most frequent word forms, in descending order of count.
 > head(sort(table(oz.words), decreasing = TRUE), 10)
 oz.words
  the  and   to   of    a    I  was   in  you   he 
 2731 1593 1096  811  795  647  501  463  448  410 

!TypeとTokenの分布を見てみる
*394行2列の行列、0で初期化
 > # One row per complete 100-token step (394 rows for this text), filled with 0.
 > # Column 1 will hold the token count, column 2 the type count.
 > oztt <- matrix(0, nrow = length(oz.words) %/% 100, ncol = 2)
*100語ずつ累積して39,400語までのTypeとTokenを見てみる。
	# Fill oztt with cumulative token/type counts at every 100-word checkpoint.
	# Column 1 is the number of tokens read so far; column 2 is the number of
	# distinct word forms (types) among them.
	checkpoints <- seq_len(nrow(oztt)) * 100
	# Guard against indexing past the end of the text, which would yield NA.
	stopifnot(all(checkpoints <= length(oz.words)))
	# !duplicated() marks the first occurrence of each word form, so its running
	# sum is the type count after each token; this avoids re-running unique() on
	# an ever-growing prefix.
	types.so.far <- cumsum(!duplicated(oz.words))
	oztt[, 1] <- checkpoints
	oztt[, 2] <- types.so.far[checkpoints]
*データフレーム化して、見出しをつける
 > # Convert the matrix to a data frame whose two columns are named token and type.
 > oztt <- setNames(as.data.frame(oztt), c("token", "type"))
 > head(oztt)
   token type
 1   100   63
 2   200  122
 3   300  167
 4   400  209
 5   500  248
 6   600  280

 > # Scatter plot of type count (y) against token count (x).
 > plot(x = oztt$token, y = oztt$type)
{{ref_image tt.png}}

!!多様性を見てみる
!TTRを追加する
 > # Type-token ratio: type count divided by token count, row by row.
 > oztt[["ttr"]] <- with(oztt, type / token)
 > head(oztt)
   token type       ttr
 1   100   63 0.6300000
 2   200  122 0.6100000
 3   300  167 0.5566667
 4   400  209 0.5225000
 5   500  248 0.4960000
 6   600  280 0.4666667
*tokenの増加に伴うtypeの増加、および、TTRの減少をグラフにプロット
 > # Draw the type curve first, then overlay the TTR curve in red.
 > plot(oztt$token, oztt$type)
 > # par(new = TRUE) keeps the existing plot so the next plot() is drawn on top
 > # of it; each plot() call uses its own y-axis scale.
 > par(new = TRUE)
 > plot(oztt$token, oztt$ttr, col = "red")
{{ref_image TTR.png}}

!Guiraud Index（ギロー・インデックス）を追加してみる
 > # Guiraud Index: type count divided by the square root of the token count.
 > oztt$gi <- oztt$type / sqrt(oztt$token)
 > # Overlay the index in blue on whatever plot is currently on the device.
 > par(new = TRUE)
 > plot(oztt$token, oztt$gi, col = "blue")
{{ref_image GI.png}}
*Guiraud Index は、およそ4,000語を超えたあたりからほぼ水平になり、意外と安定している。

!!千語までで見てみる
{{ref_image ogttt.png}}

!!2千語までで見てみる
{{ref_image oztt2t.png}}


!!1万語までで見てみる
{{ref_image oztt10t.png}}