R
!!!relativeFrequency 相対頻度について
{{outline}}

!!サンプルデータ　１００万語
http://www.thegrammarlab.com/?nor-portfolio=1000000-word-sample-corpora
http://micusp.elicorpora.info/
MICUSP Sample
http://www.thegrammarlab.com/?nor-portfolio=1000000-word-sample-corpora#
{{pre
# Load the sample corpus and build the quanteda objects (corpus, tokens,
# dfm and 100-token chunks) that the rest of this page works with.

# Install the packages only when they are missing, so that re-running the
# script does not reinstall them every time.
if (!requireNamespace("readtext", quietly = TRUE)) {
  install.packages("readtext", dependencies = TRUE)
}
if (!requireNamespace("quanteda", quietly = TRUE)) {
  install.packages("quanteda")
}
library(readtext)
library(quanteda)

# Read the sample data from a file picked interactively.
# file.choose() is available on every platform; choose.files() exists only
# in the Windows build of R.
sample100 <- readtext(file.choose())

sample100
str(sample100)

sample100.corpus <- corpus(sample100)

summary(sample100.corpus)

# Tokenise first and build the dfm from the tokens: calling dfm() directly
# on a corpus is deprecated in recent quanteda releases.
sample100.tokens <- tokens(sample100.corpus)

sample100.dfm <- dfm(sample100.tokens)

summary(sample100.dfm)

# Sampling: cut the token stream into consecutive chunks of 100 tokens.
# The result is stored because the following sections use sample100.chunks.
sample100.chunks <- tokens_chunk(sample100.tokens, 100)
sample100.chunks
}}

{{pre
> sample100.chunks <- tokens_chunk(sample100.tokens, 100)
> summary(sample100.chunks)
                       Length Class  Mode     
sampleMICUSP.txt.1     100    -none- character
sampleMICUSP.txt.2     100    -none- character
sampleMICUSP.txt.3     100    -none- character
sampleMICUSP.txt.4     100    -none- character
sampleMICUSP.txt.5     100    -none- character
sampleMICUSP.txt.6     100    -none- character
sampleMICUSP.txt.7     100    -none- character
sampleMICUSP.txt.8     100    -none- character
sampleMICUSP.txt.9     100    -none- character
sampleMICUSP.txt.10    100    -none- character
sampleMICUSP.txt.11    100    -none- character
sampleMICUSP.txt.12    100    -none- character

> head(sample100.chunks)
tokens from 6 documents.
sampleMICUSP.txt.1 :
  [1] "throughout"     "history"        "plague"         "has"            "been"           "made"           "infamous"       "as"            
  [9] "the"            "ultimate"       "biological"     "killer"         "the"            "word"           "is"             "now"           
 [17] "synonymous"     "with"           "any"            "particularly"   "contagious"     "lethal"         "and"            "uncontrollable"
 [25] "epidemic"       "however"        "the"            "true"           "plague"         "caused"         "by"             "the"           
 [33] "bacterium"      "yersinia"       "pestis"         "has"            "largely"        "been"           "ignored"        "in"            
 [41] "recent"         "years"          "many"           "people"         "think"          "of"             "plague"         "as"            
 [49] "an"             "extinct"        "disease"        "of"             "the"            "middle"         "ages"           "a"             
 [57] "horrifying"     "tale"           "from"           "history"        "class"          "that"           "has"            "been"          
 [65] "eliminated"     "from"           "society"        "through"        "time"           "and"            "technology"     "of"            
 [73] "course"         "this"           "is"             "not"            "the"            "case"           "although"       "plague"        
 [81] "like"           "many"           "other"          "diseases"       "has"            "been"           "eliminated"     "from"          
 [89] "industrial"     "countries"      "it"             "continues"      "to"             "afflict"        "most"           "parts"         
 [97] "of"             "the"            "world"          "in"            

sampleMICUSP.txt.2 :
  [1] "fact"         "just"         "last"         "month"        "pneumonic"    "plague"       "killed"       "over"         "people"      
 [10] "in"           "a"            "recently"     "reopened"     "diamond"      "mine"         "in"           "the"          "northeastern"
 [19] "region"       "of"           "the"          "democratic"   "republic"     "of"           "the"          "congo"        "the"         


> sample100.chunks[1]
tokens from 1 document.
sampleMICUSP.txt.1 :
  [1] "throughout"     "history"        "plague"         "has"            "been"           "made"           "infamous"       "as"            
  [9] "the"            "ultimate"       "biological"     "killer"         "the"            "word"           "is"             "now"           
 [17] "synonymous"     "with"           "any"            "particularly"   "contagious"     "lethal"         "and"            "uncontrollable"
 [25] "epidemic"       "however"        "the"            "true"           "plague"         "caused"         "by"             "the"           
 [33] "bacterium"      "yersinia"       "pestis"         "has"            "largely"        "been"           "ignored"        "in"            
 [41] "recent"         "years"          "many"           "people"         "think"          "of"             "plague"         "as"            
 [49] "an"             "extinct"        "disease"        "of"             "the"            "middle"         "ages"           "a"             
 [57] "horrifying"     "tale"           "from"           "history"        "class"          "that"           "has"            "been"          
 [65] "eliminated"     "from"           "society"        "through"        "time"           "and"            "technology"     "of"            
 [73] "course"         "this"           "is"             "not"            "the"            "case"           "although"       "plague"        
 [81] "like"           "many"           "other"          "diseases"       "has"            "been"           "eliminated"     "from"          
 [89] "industrial"     "countries"      "it"             "continues"      "to"             "afflict"        "most"           "parts"         
 [97] "of"             "the"            "world"          "in"            

> sample100.chunks[99]
tokens from 1 document.
sampleMICUSP.txt.99 :
  [1] "evolve"      "when"        "males"       "can"         "control"     "access"      "to"          "females"     "or"          "resources"  
 [11] "the"         "females"     "need"        "the"         "largest"     "males"       "can"         "control"     "access"      "to"         
 [21] "females"     "resulting"   "in"          "several"     "females"     "mating"      "with"        "one"         "male"        "because"    
 [31] "the"         "male"        "can"         "directly"    "control"     "the"         "female"      "she"         "will"        "not"        
 [41] "mate"        "with"        "many"        "other"       "males"       "there"       "is"          "less"        "selective"   "pressure"   
 [51] "for"         "more"        "competitive" "sperm"       "so"          "it"          "is"          "expected"    "that"        "the"        
 [61] "eastern"     "testis"      "epididymis"  "would"       "be"          "smaller"     "than"        "the"         "whitebelly"  "however"    
 [71] "because"     "the"         "dolphins"    "were"        "not"         "studied"     "directly"    "it"          "is"          "unknown"    
 [81] "how"         "exactly"     "the"         "mating"      "systems"     "work"        "and"         "if"          "the"         "assignment" 
 [91] "of"          "a"           "promiscuous" "mating"      "system"      "to"          "the"         "whitebelly"  "form"        "and"        

}}
!!全体の高頻度語
{{pre
> topfeatures(sample100.dfm)
  the    of    to   and    in     a    is  that   for    as 
68659 38144 28852 28657 22834 19779 15292 14552  9630  9208 

> topfeatures(sample100.dfm, 100)
      the        of        to       and        in         a        is      that       for        as      this        be       are      with 
    68659     38144     28852     28657     22834     19779     15292     14552      9630      9208      7612      7154      6860      6814 
        s        it        on       not        by       was      from        an     their      have        or     which        at         i 
     6396      6227      5998      5401      5374      4712      4497      4194      3932      3820      3804      3415      3314      3231 
     they       his      were        we       can      more       one     these      will        he       has       but      also     would 
     3171      3123      2953      2898      2866      2837      2833      2817      2802      2645      2600      2541      2425      2424 
    other       all     there       her   between      when      than        if       two      only   because     about      such       may 
     2067      2056      1960      1867      1863      1832      1795      1775      1695      1675      1666      1634      1584      1534 
  however      time      been      what       its      into       who        so  students      each      both     first    social      most 
     1515      1505      1478      1475      1467      1465      1407      1384      1379      1367      1355      1345      1340      1328 
      how       had     while      some        do    people       she   through      used        no     could      does      many      them 
     1308      1300      1258      1258      1248      1225      1220      1206      1202      1198      1185      1179      1170      1167 
different      then       our        my     women       use       out     state     being      well      data     study    system    should 
     1145      1083      1077      1049      1023      1016      1015      1000       985       973       971       961       943       938 
     even     where 
      930       906 
}}
!!1，１０，１００，１０００位の語と頻度
{{pre
> top1000 <- topfeatures(sample100.dfm, 1000)

> top1000[1]
  the 
68659 
> top1000[10]
  as 
9208 
> top1000[100]
where 
  906 
> top1000[1000]
sites 
  125 
}}
{{ref_image top1000.png}}
{{ref_image logtop1000.png}}
!topfeatures()の結果の中身
{{pre
> summary(top1000)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  125.0   162.0   233.0   707.9   393.2 68659.0 
> str(top1000)
 Named num [1:1000] 68659 38144 28852 28657 22834 ...
 - attr(*, "names")= chr [1:1000] "the" "of" "to" "and" ...
> skim(top1000)
─ Data Summary ────────────
                           Values 
Name                       top1000
Number of rows             1000   
Number of columns          1      
_______________________           
Column type frequency:            
  numeric                  1      
________________________          
Group variables            None   

─ Variable type: numeric ───────────────────────────
}}
*names属性のついた数値ベクトル
**要素番号のように、要素を指定すると、その数値が表示される。
{{pre
> top1000["the"]
  the 
68659 
> top1000["book"]
book 
 244 
> top1000["there"]
there 
 1960 
> top1000["therefore"]
therefore 
      528 
}}
*要素番号の指定もできる。（この場合、要素番号が頻度順位となる）
{{pre
> top1000[300]
every 
  344 
> top1000[200]
individuals 
        468 
> top1000[150]
analysis 
     607 
> top1000[170]
part 
 540 
> top1000[168]
just 
 546 
> top1000[160]
although 
     568 
> top1000[180]
see 
507 
> top1000[175]
therefore 
      528 
}}
!!チャンク内の検索と頻度
{{pre
> sample100.chunks[1]
tokens from 1 document.
sampleMICUSP.txt.1 :
  [1] "throughout"     "history"        "plague"         "has"            "been"           "made"           "infamous"       "as"            
  [9] "the"            "ultimate"       "biological"     "killer"         "the"            "word"           "is"             "now"           
 [17] "synonymous"     "with"           "any"            "particularly"   "contagious"     "lethal"         "and"            "uncontrollable"
 [25] "epidemic"       "however"        "the"            "true"           "plague"         "caused"         "by"             "the"           
 [33] "bacterium"      "yersinia"       "pestis"         "has"            "largely"        "been"           "ignored"        "in"            
 [41] "recent"         "years"          "many"           "people"         "think"          "of"             "plague"         "as"            
 [49] "an"             "extinct"        "disease"        "of"             "the"            "middle"         "ages"           "a"             
 [57] "horrifying"     "tale"           "from"           "history"        "class"          "that"           "has"            "been"          
 [65] "eliminated"     "from"           "society"        "through"        "time"           "and"            "technology"     "of"            
 [73] "course"         "this"           "is"             "not"            "the"            "case"           "although"       "plague"        
 [81] "like"           "many"           "other"          "diseases"       "has"            "been"           "eliminated"     "from"          
 [89] "industrial"     "countries"      "it"             "continues"      "to"             "afflict"        "most"           "parts"         
 [97] "of"             "the"            "world"          "in"            

> grep("\\bthe\\b", sample100.chunks[1])
[1]  9 13 27 32 53 77 98
> length(grep("\\bthe\\b", sample100.chunks[1]))
[1] 7
}}
!"the"の頻度分布
{{pre
# Frequency of "the" in every 100-token chunk.
#
# Each chunk is handled as a plain character vector of tokens and the
# matches of the whole-word pattern are counted. vapply() walks over all
# chunks that exist, so the number of chunks is not hard-coded and the
# result vector is not grown element by element.
the_per_chunk <- vapply(
  as.list(sample100.chunks),
  function(chunk_tokens) length(grep("\\bthe\\b", chunk_tokens)),
  integer(1),
  USE.NAMES = FALSE
)

# The leading 0 is kept so that x has the same layout as before:
# x[1] is the initial placeholder and x[k + 1] is the count for chunk k.
# Use x[-1] if the placeholder should not be treated as an observation.
x <- c(0, the_per_chunk)
x
}}

{{pre
> x
   [1]  0  7  7  9 10  8  8 10  6  6  8  7  6  5  8  3 13  6  8  8  5  4  4  6  7  8  3  6  9  5  6  6  6  5  8  3 10  8 11  8  6  4  8 11 11  5  3
  [48]  2  4  4  4  6  6  4  9  5  5  2  2  2  4 10  9 12 11  3  3  5  3  0 10  9  7  6  2  5  5  3  9  4  6  5  3  9 14  4  7  2  1  6  4  2  5  7
  [95]  6  6  6 14 12 10  8  5  5  4  6  3  4  4  5 12 11 11  6 12  8 11 11  4  8  8  6  4  8  5  4  5  6  6 11  8  9 10  8  7  3  8  4  5  2  7  8
 [142]  8 11  3  9 10  4 12  4  9  5  9  9  9  9 10  5  3  9  9  9 11  5  8 12 12  8  7 10  4  6  4  3  6  2  4  8  6  4  5  7  7  9  4 10  7  9  6
 [189]  5 10  7 11 10  9  9  9 11 10 13  6  8  6  7 11  7 10  8  9  6  9  8 10 11  6  7  4  7  4  7  8  7  8  9 11 13 11 12  8 13 12 10  5 10 11  9
 [236]  7  4  1  3  3  4  1  2  5  4  5  2  3  3  9  7  4 14 11  5 12 11 14 11 12  6 13 13  7 13 12  5  5 11  9  8 14 10 10  6 14  6  6  5  6  4  4
 
> plot(x)

> hist(x)

> barplot(x)

> barplot(sort(x))
}}
{{ref_image plotthe.png}}
{{ref_image boxplotthe.png}}
{{ref_image histthe.png}}
{{ref_image barplotthe.png}}
{{ref_image barplotthesorted.png}}
!"the"の累積頻度分布
{{pre
# Cumulative frequency of "the" over the chunks, in corpus order.
#
# The per-chunk counts are computed in one pass over all existing chunks
# and accumulated with cumsum() instead of a hand-written running total.
the_counts <- vapply(
  as.list(sample100.chunks),
  function(chunk_tokens) length(grep("\\bthe\\b", chunk_tokens)),
  integer(1),
  USE.NAMES = FALSE
)

# rui[1] is the starting value 0 (before any chunk has been read);
# rui[k + 1] is the total number of hits in chunks 1..k.
rui <- c(0, cumsum(the_counts))
rui


}}

{{pre
> tail(rui)
[1] 68636 68638 68641 68649 68653 68659

> rui
   [1]    0    7   14   23   33   41   49   59   65   71   79   86   92   97  105  108  121  127  135  143  148  152  156  162  169  177  180  186
  [29]  195  200  206  212  218  223  231  234  244  252  263  271  277  281  289  300  311  316  319  321  325  329  333  339  345  349  358  363
  [57]  368  370  372  374  378  388  397  409  420  423  426  431  434  434  444  453  460  466  468  473  478  481  490  494  500  505  508  517
  [85]  531  535  542  544  545  551  555  557  562  569  575  581  587  601  613  623  631  636  641  645  651  654  658  662  667  679  690  701
 [113]  707  719  727  738  749  753  761  769  775  779  787  792  796  801  807  813  824  832  841  851  859  866  869  877  881  886  888  895
 [141]  903  911  922  925  934  944  948  960  964  973  978  987  996 1005 1014 1024 1029 1032 1041 1050 1059 1070 1075 1083 1095 1107 1115 1122
 [169] 1132 1136 1142 1146 1149 1155 1157 1161 1169 1175 1179 1184 1191 1198 1207 1211 1221 1228 1237 1243 1248 1258 1265 1276 1286 1295 1304 1313


plot(rui)
}}
{{ref_image plotrui.png}}

!１００語ずつ１００万語まで１万回サンプリングしてみるとほぼ直線的に増えていることがわかる。

!最初の１００語での出現頻度は、７回。これを１万倍したら70000。実際は、68659回。推定値は実際の約1.02倍（誤差は約２％）。

!!１０００位の sites について
!sitesの頻度分布
{{pre
# Frequency of "sites" in every 100-token chunk.
#
# Each chunk is handled as a plain character vector of tokens; vapply()
# covers all chunks that exist, so the chunk count is not hard-coded and
# the result is not grown inside a loop.
sites_per_chunk <- vapply(
  as.list(sample100.chunks),
  function(chunk_tokens) length(grep("\\bsites\\b", chunk_tokens)),
  integer(1),
  USE.NAMES = FALSE
)

# The leading 0 is kept so that sx has the same layout as before:
# sx[1] is the initial placeholder and sx[k + 1] is the count for chunk k.
sx <- c(0, sites_per_chunk)
sx

> sx
   [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  [72] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 [143] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0
 [214] 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0
 [285] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 2 1 1 5 0 1 3 1 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 [356] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 4 1 2 2 0 0 2 6 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 [427] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 
# Index plot: the per-chunk "sites" counts in the order they are stored in sx.
plot(sx)
# Histogram of the per-chunk counts.
hist(sx)
# One bar per element of sx, in the original order.
barplot(sx)
# The same bars, sorted by count in ascending order.
barplot(sort(sx))
}}
{{ref_image plotsx.png}}
{{ref_image histsx.png}}
{{ref_image barplotsx.png}}
{{ref_image barplotsxsorted.png}}
!sitesの累積頻度分布
{{pre
# Cumulative frequency of "sites" over the chunks, in corpus order.
#
# The per-chunk counts are computed in one pass over all existing chunks
# and accumulated with cumsum() instead of a hand-written running total.
sites_counts <- vapply(
  as.list(sample100.chunks),
  function(chunk_tokens) length(grep("\\bsites\\b", chunk_tokens)),
  integer(1),
  USE.NAMES = FALSE
)

# rui_sites[1] is the starting value 0; rui_sites[k + 1] is the total
# number of hits in chunks 1..k.
rui_sites <- c(0, cumsum(sites_counts))
rui_sites
plot(rui_sites)

tail(rui_sites)
[1] 125 125 125 125 125 125

}}
{{ref_image rui_sites.png}}

!!分散を見てみる
{{pre
# Lexical dispersion ("x-ray") plots: collect the keyword-in-context hits
# for each word, then draw where in the corpus they fall.
# NOTE(review): in quanteda >= 3 textplot_xray() appears to be provided by
# the quanteda.textplots package and kwic() expects tokens - confirm against
# the installed version.
hits_the <- kwic(sample100.corpus, pattern = "the")
textplot_xray(hits_the)

hits_sites <- kwic(sample100.corpus, pattern = "sites")
textplot_xray(hits_sites)
}}
{{ref_image dispersion_the.png}}
{{ref_image dispersion_sites.png}}
!theはどこにおいても均一に出現している
!sitesの出現は、偏っている。
!ゆえに、sitesについては、その一部に基づき、相対頻度を出すことは、結果がゆがむ恐れが大きい。

!!thereforeについて
{{pre
# Frequency of "therefore" in every 100-token chunk, with two plots.
#
# Each chunk is handled as a plain character vector of tokens; vapply()
# covers all chunks that exist, so the chunk count is not hard-coded and
# the result is not grown inside a loop.
therefore_per_chunk <- vapply(
  as.list(sample100.chunks),
  function(chunk_tokens) length(grep("\\btherefore\\b", chunk_tokens)),
  integer(1),
  USE.NAMES = FALSE
)

# The leading 0 is kept so that the vector has the same layout as before:
# element 1 is the initial placeholder, element k + 1 belongs to chunk k.
therefore <- c(0, therefore_per_chunk)

# One bar per element, then the distribution of the per-chunk counts.
barplot(therefore)
hist(therefore)
}}

{{ref_image barplottherefore.png}}
{{ref_image histtherefore.png}}

!累積頻度
{{pre
# Cumulative frequency of "therefore" over the chunks, in corpus order.
#
# The per-chunk counts are computed in one pass over all existing chunks
# and accumulated with cumsum() instead of a hand-written running total.
therefore_counts <- vapply(
  as.list(sample100.chunks),
  function(chunk_tokens) length(grep("\\btherefore\\b", chunk_tokens)),
  integer(1),
  USE.NAMES = FALSE
)

# rui_therefore[1] is the starting value 0; rui_therefore[k + 1] is the
# total number of hits in chunks 1..k.
rui_therefore <- c(0, cumsum(therefore_counts))
rui_therefore

   [1]  0  0  0  1  1  2  2  2  2  2  2  2  2  2  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  4  4  4  4  5  5  5  5  5  5  5  5  5  5  5  5
  [48]  5  5  5  5  5  5  5  5  5  5  6  6  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  8  8  8  8  8  8  8  8  8  8  9  9  9  9  9  9  9  9  9
  [95]  9  9  9  9 10 10 11 12 12 13 13 13 13 13 14 14 14 14 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15
 [142] 15 15 15 15 15 15 15 15 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 17 18 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 21
 [189] 21 22 22 23 24 24 25 27 27 27 27 27 27 27 27 27 27 27 27 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28
 [236] 28 28 28 28 28 28 28 28 28 28 28 28 28 28 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29
 [283] 29 29 29 29 29 29 29 29 29 29 29 29 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30
 [330] 30 30 30 30 30 30 30 31 31 31 31 31 31 31 31 31 31 31 31 31 31 31 31 31 31 31 31 31 31 31 32 32 32 33 33 33 33 33 33 33 33 33 33 33 34 34 34
 [

plot(rui_therefore)

tail(rui_therefore)
[1] 528 528 528 528 528 528

}}
{{ref_image plotruitherefore.png}}