R !!!relativeFrequency 相対頻度について {{outline}} !!サンプルデータ 100万語 http://www.thegrammarlab.com/?nor-portfolio=1000000-word-sample-corpora http://micusp.elicorpora.info/ MICUSP Sample http://www.thegrammarlab.com/?nor-portfolio=1000000-word-sample-corpora# {{pre # サンプルデータを読み込む install.packages("readtext", dependencies = T) library(readtext) sample100 <- readtext(choose.files()) sample100 str(sample100) install.packages("quanteda") library(quanteda) sample100.corpus <- corpus(sample100) summary(sample100.corpus) sample100.dfm <- dfm(sample100.corpus) summary(sample100.dfm) sample100.tokens <- tokens(sample100.corpus) #サンプリング tokens_chunk(sample100.tokens, 100) }} {{pre > sample100.chunks <- tokens_chunk(sample100.tokens, 100) > summary(sample100.chunks) Length Class Mode sampleMICUSP.txt.1 100 -none- character sampleMICUSP.txt.2 100 -none- character sampleMICUSP.txt.3 100 -none- character sampleMICUSP.txt.4 100 -none- character sampleMICUSP.txt.5 100 -none- character sampleMICUSP.txt.6 100 -none- character sampleMICUSP.txt.7 100 -none- character sampleMICUSP.txt.8 100 -none- character sampleMICUSP.txt.9 100 -none- character sampleMICUSP.txt.10 100 -none- character sampleMICUSP.txt.11 100 -none- character sampleMICUSP.txt.12 100 -none- character > head(sample100.chunks) tokens from 6 documents. 
sampleMICUSP.txt.1 : [1] "throughout" "history" "plague" "has" "been" "made" "infamous" "as" [9] "the" "ultimate" "biological" "killer" "the" "word" "is" "now" [17] "synonymous" "with" "any" "particularly" "contagious" "lethal" "and" "uncontrollable" [25] "epidemic" "however" "the" "true" "plague" "caused" "by" "the" [33] "bacterium" "yersinia" "pestis" "has" "largely" "been" "ignored" "in" [41] "recent" "years" "many" "people" "think" "of" "plague" "as" [49] "an" "extinct" "disease" "of" "the" "middle" "ages" "a" [57] "horrifying" "tale" "from" "history" "class" "that" "has" "been" [65] "eliminated" "from" "society" "through" "time" "and" "technology" "of" [73] "course" "this" "is" "not" "the" "case" "although" "plague" [81] "like" "many" "other" "diseases" "has" "been" "eliminated" "from" [89] "industrial" "countries" "it" "continues" "to" "afflict" "most" "parts" [97] "of" "the" "world" "in" sampleMICUSP.txt.2 : [1] "fact" "just" "last" "month" "pneumonic" "plague" "killed" "over" "people" [10] "in" "a" "recently" "reopened" "diamond" "mine" "in" "the" "northeastern" [19] "region" "of" "the" "democratic" "republic" "of" "the" "congo" "the" > sample100.chunks[1] tokens from 1 document. 
sampleMICUSP.txt.1 : [1] "throughout" "history" "plague" "has" "been" "made" "infamous" "as" [9] "the" "ultimate" "biological" "killer" "the" "word" "is" "now" [17] "synonymous" "with" "any" "particularly" "contagious" "lethal" "and" "uncontrollable" [25] "epidemic" "however" "the" "true" "plague" "caused" "by" "the" [33] "bacterium" "yersinia" "pestis" "has" "largely" "been" "ignored" "in" [41] "recent" "years" "many" "people" "think" "of" "plague" "as" [49] "an" "extinct" "disease" "of" "the" "middle" "ages" "a" [57] "horrifying" "tale" "from" "history" "class" "that" "has" "been" [65] "eliminated" "from" "society" "through" "time" "and" "technology" "of" [73] "course" "this" "is" "not" "the" "case" "although" "plague" [81] "like" "many" "other" "diseases" "has" "been" "eliminated" "from" [89] "industrial" "countries" "it" "continues" "to" "afflict" "most" "parts" [97] "of" "the" "world" "in" > sample100.chunks[99] tokens from 1 document. sampleMICUSP.txt.99 : [1] "evolve" "when" "males" "can" "control" "access" "to" "females" "or" "resources" [11] "the" "females" "need" "the" "largest" "males" "can" "control" "access" "to" [21] "females" "resulting" "in" "several" "females" "mating" "with" "one" "male" "because" [31] "the" "male" "can" "directly" "control" "the" "female" "she" "will" "not" [41] "mate" "with" "many" "other" "males" "there" "is" "less" "selective" "pressure" [51] "for" "more" "competitive" "sperm" "so" "it" "is" "expected" "that" "the" [61] "eastern" "testis" "epididymis" "would" "be" "smaller" "than" "the" "whitebelly" "however" [71] "because" "the" "dolphins" "were" "not" "studied" "directly" "it" "is" "unknown" [81] "how" "exactly" "the" "mating" "systems" "work" "and" "if" "the" "assignment" [91] "of" "a" "promiscuous" "mating" "system" "to" "the" "whitebelly" "form" "and" }} !!全体の高頻度語 {{pre > topfeatures(sample100.dfm) the of to and in a is that for as 68659 38144 28852 28657 22834 19779 15292 14552 9630 9208 > topfeatures(sample100.dfm, 100) 
the of to and in a is that for as this be are with 68659 38144 28852 28657 22834 19779 15292 14552 9630 9208 7612 7154 6860 6814 s it on not by was from an their have or which at i 6396 6227 5998 5401 5374 4712 4497 4194 3932 3820 3804 3415 3314 3231 they his were we can more one these will he has but also would 3171 3123 2953 2898 2866 2837 2833 2817 2802 2645 2600 2541 2425 2424 other all there her between when than if two only because about such may 2067 2056 1960 1867 1863 1832 1795 1775 1695 1675 1666 1634 1584 1534 however time been what its into who so students each both first social most 1515 1505 1478 1475 1467 1465 1407 1384 1379 1367 1355 1345 1340 1328 how had while some do people she through used no could does many them 1308 1300 1258 1258 1248 1225 1220 1206 1202 1198 1185 1179 1170 1167 different then our my women use out state being well data study system should 1145 1083 1077 1049 1023 1016 1015 1000 985 973 971 961 943 938 even where 930 906 }} !!1,10,100,1000位の語と頻度 {{pre > top1000 <- topfeatures(sample100.dfm, 1000) > top1000[1] the 68659 > top1000[10] as 9208 > top1000[100] where 906 > top1000[1000] sites 125 }} {{ref_image top1000.png}} {{ref_image logtop1000.png}} !topfeatures()の結果の中身 {{pre > summary(top1000) Min. 1st Qu. Median Mean 3rd Qu. Max. 125.0 162.0 233.0 707.9 393.2 68659.0 > str(top1000) Named num [1:1000] 68659 38144 28852 28657 22834 ... - attr(*, "names")= chr [1:1000] "the" "of" "to" "and" ... 
> skim(top1000) ─ Data Summary ──────────── Values Name top1000 Number of rows 1000 Number of columns 1 _______________________ Column type frequency: numeric 1 ________________________ Group variables None ─ Variable type: numeric ─────────────────────────── }} *names属性のついた数値ベクトル **要素番号と同様に、要素名(語)を指定すると、その数値(頻度)が表示される。 {{pre > top1000["the"] the 68659 > top1000["book"] book 244 > top1000["there"] there 1960 > top1000["therefore"] therefore 528 }} *要素番号の指定もできる。(この場合、要素番号が頻度順位となる) {{pre > top1000[300] every 344 > top1000[200] individuals 468 > top1000[150] analysis 607 > top1000[170] part 540 > top1000[168] just 546 > top1000[160] although 568 > top1000[180] see 507 > top1000[175] therefore 528 }} !!チャンク内の検索と頻度 {{pre > sample100.chunks[1] tokens from 1 document. sampleMICUSP.txt.1 : [1] "throughout" "history" "plague" "has" "been" "made" "infamous" "as" [9] "the" "ultimate" "biological" "killer" "the" "word" "is" "now" [17] "synonymous" "with" "any" "particularly" "contagious" "lethal" "and" "uncontrollable" [25] "epidemic" "however" "the" "true" "plague" "caused" "by" "the" [33] "bacterium" "yersinia" "pestis" "has" "largely" "been" "ignored" "in" [41] "recent" "years" "many" "people" "think" "of" "plague" "as" [49] "an" "extinct" "disease" "of" "the" "middle" "ages" "a" [57] "horrifying" "tale" "from" "history" "class" "that" "has" "been" [65] "eliminated" "from" "society" "through" "time" "and" "technology" "of" [73] "course" "this" "is" "not" "the" "case" "although" "plague" [81] "like" "many" "other" "diseases" "has" "been" "eliminated" "from" [89] "industrial" "countries" "it" "continues" "to" "afflict" "most" "parts" [97] "of" "the" "world" "in" > grep("\\bthe\\b", sample100.chunks[1]) [1] 9 13 27 32 53 77 98 > length(grep("\\bthe\\b", sample100.chunks[1])) [1] 7 }} !"the"の頻度分布 {{pre x <- 0 for (i in 1:10000){ y <- length(grep("\\bthe\\b", sample100.chunks[i])) x <- append(x, y) } x }} {{pre > x [1] 0 7 7 9 10 8 8 10 6 6 8 7 6 5 8 3 13 6 8 8 5 4 4 6 7 8 3 6 9 5 6 6 6 
5 8 3 10 8 11 8 6 4 8 11 11 5 3 [48] 2 4 4 4 6 6 4 9 5 5 2 2 2 4 10 9 12 11 3 3 5 3 0 10 9 7 6 2 5 5 3 9 4 6 5 3 9 14 4 7 2 1 6 4 2 5 7 [95] 6 6 6 14 12 10 8 5 5 4 6 3 4 4 5 12 11 11 6 12 8 11 11 4 8 8 6 4 8 5 4 5 6 6 11 8 9 10 8 7 3 8 4 5 2 7 8 [142] 8 11 3 9 10 4 12 4 9 5 9 9 9 9 10 5 3 9 9 9 11 5 8 12 12 8 7 10 4 6 4 3 6 2 4 8 6 4 5 7 7 9 4 10 7 9 6 [189] 5 10 7 11 10 9 9 9 11 10 13 6 8 6 7 11 7 10 8 9 6 9 8 10 11 6 7 4 7 4 7 8 7 8 9 11 13 11 12 8 13 12 10 5 10 11 9 [236] 7 4 1 3 3 4 1 2 5 4 5 2 3 3 9 7 4 14 11 5 12 11 14 11 12 6 13 13 7 13 12 5 5 11 9 8 14 10 10 6 14 6 6 5 6 4 4 > plot(x) > hist(x) > barplot(x) > barplot(sort(x)) }} {{ref_image plotthe.png}} {{ref_image boxplotthe.png}} {{ref_image histthe.png}} {{ref_image barplotthe.png}} {{ref_image barplotthesorted.png}} !"the"の累積頻度分布 {{pre z <- 0 y <- 0 rui <- 0 for (i in 1:10000){ y <- length(grep("\\bthe\\b", sample100.chunks[i])) z <- z + y rui <- append(rui, z) } rui }} {{pre > tail(rui) [1] 68636 68638 68641 68649 68653 68659 > rui [1] 0 7 14 23 33 41 49 59 65 71 79 86 92 97 105 108 121 127 135 143 148 152 156 162 169 177 180 186 [29] 195 200 206 212 218 223 231 234 244 252 263 271 277 281 289 300 311 316 319 321 325 329 333 339 345 349 358 363 [57] 368 370 372 374 378 388 397 409 420 423 426 431 434 434 444 453 460 466 468 473 478 481 490 494 500 505 508 517 [85] 531 535 542 544 545 551 555 557 562 569 575 581 587 601 613 623 631 636 641 645 651 654 658 662 667 679 690 701 [113] 707 719 727 738 749 753 761 769 775 779 787 792 796 801 807 813 824 832 841 851 859 866 869 877 881 886 888 895 [141] 903 911 922 925 934 944 948 960 964 973 978 987 996 1005 1014 1024 1029 1032 1041 1050 1059 1070 1075 1083 1095 1107 1115 1122 [169] 1132 1136 1142 1146 1149 1155 1157 1161 1169 1175 1179 1184 1191 1198 1207 1211 1221 1228 1237 1243 1248 1258 1265 1276 1286 1295 1304 1313 plot(rui) }} {{ref_image plotrui.png}} !100語ずつ100万語まで1万回サンプリングしてみるとほぼ直線的に増えていることがわかる。 !最初の100語での出現頻度は、7回。これを1万倍したら70000。実際は、68659回。推定値は実測値の約1.02倍(70000 ÷ 68659 ≈ 1.02)で、誤差は約2%。 
!!1000位の sites について !sitesの頻度分布 {{pre sx <- 0 for (i in 1:10000){ y <- length(grep("\\bsites\\b", sample100.chunks[i])) sx <- append(sx, y) } sx > sx [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 [72] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 [143] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 [214] 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 [285] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 2 1 1 5 0 1 3 1 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 [356] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 4 1 2 2 0 0 2 6 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 [427] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 plot(sx) hist(sx) barplot(sx) barplot(sort(sx)) }} {{ref_image plotsx.png}} {{ref_image histsx.png}} {{ref_image barplotsx.png}} {{ref_image barplotsxsorted.png}} !siteの累積頻度分布 {{pre z <- 0 y <- 0 rui_sites <- 0 for (i in 1:10000){ y <- length(grep("\\bsites\\b", sample100.chunks[i])) z <- z + y rui_sites <- append(rui_sites, z) } rui_sites plot(rui_sites) tail(rui_sites) [1] 125 125 125 125 125 125 }} {{ref_image rui_sites.png}} !!分散を見てみる {{pre textplot_xray(kwic(sample100.corpus, pattern = "the")) textplot_xray(kwic(sample100.corpus, pattern = "sites")) }} {{ref_image dispersion_the.png}} {{ref_image dispersion_sites.png}} !theはどこにおいても均一に出現している !sitesの出現は、偏っている。 !ゆえに、sitesについては、その一部に基づき、相対頻度を出すことは、結果がゆがむ恐れが大きい。 !!thereforeについて {{pre therefore <- 0 for (i in 1:10000){ y <- length(grep("\\btherefore\\b", 
sample100.chunks[i])) therefore <- append(therefore, y) } barplot(therefore) hist(therefore) }} {{ref_image barplottherefore.png}} {{ref_image histtherefore.png}} !累積頻度 {{pre z <- 0 y <- 0 rui_therefore <- 0 for (i in 1:10000){ y <- length(grep("\\btherefore\\b", sample100.chunks[i])) z <- z + y rui_therefore <- append(rui_therefore, z) } rui_therefore [1] 0 0 0 1 1 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 4 4 4 5 5 5 5 5 5 5 5 5 5 5 5 [48] 5 5 5 5 5 5 5 5 5 5 6 6 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 8 8 8 8 8 8 8 8 8 8 9 9 9 9 9 9 9 9 9 [95] 9 9 9 9 10 10 11 12 12 13 13 13 13 13 14 14 14 14 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 [142] 15 15 15 15 15 15 15 15 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 17 18 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 21 [189] 21 22 22 23 24 24 25 27 27 27 27 27 27 27 27 27 27 27 27 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 [236] 28 28 28 28 28 28 28 28 28 28 28 28 28 28 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 [283] 29 29 29 29 29 29 29 29 29 29 29 29 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 [330] 30 30 30 30 30 30 30 31 31 31 31 31 31 31 31 31 31 31 31 31 31 31 31 31 31 31 31 31 31 31 32 32 32 33 33 33 33 33 33 33 33 33 33 33 34 34 34 [ plot(rui_therefore) tail(rui_therefore) [1] 528 528 528 528 528 528 }} {{ref_image plotruitherefore.png}}