R
R.package
!!!ngram
*文字列をn-gramに切り分ける
{{outline}}
*https://cran.r-project.org/web/packages/ngram/index.html
----
!注
*一文一行になっていないと、文をまたいでn-gramを生成してしまう。

!データの読み込み
*テキストファイルの読み込み
 readLines()
*特定のフォルダー内の、特定の拡張子のファイルを一度に読み込む
 multiread()
 multiread(パス, extension="拡張子")

!大文字小文字・句読点の前処理 preprocess()
 preprocess(データ, case="lower", remove.punct=TRUE)

!tmパッケージとの関連
*tmパッケージでは、データは、「Corpus object」という特殊なフォーマットのオブジェクトになっている
*ngramのプログラムはそのデータを直接扱えない
*以下のような命令で読み込む
 concatenate(lapply(データ , "[", 1) )

!n-gramの作成 ngram(データ, n=グラム数)
*作成されたものは、独自の「ngramオブジェクト」として保存される。

!作成されたn-gramオブジェクト全体の出力 print(オブジェクト, output="full")
*output="truncated" とすると、全部は出ない。

!一覧表形式で出力 get.phrasetable(オブジェクト)
*入れ子式にすると便利
 get.phrasetable(ngram(テキストデータ, n=グラム数))

!n-gram表現を個別にすべて出力 get.ngrams(オブジェクト)
*入れ子式にすると便利
 get.ngrams(ngram(テキストデータ, n=グラム数))


!疑似的文字列の生成 rcorpus(単語数, alphabet=letters[何文字から:何文字まで使って], maxwordlen=最大単語長)
{{pre
> rcorpus(20, alphabet=letters[1:3], maxwordlen=4)
[1] "cbac ab a cbca aaa bc ba abaa ab a a caa bba abc cb a abbc a cc abbc"

> rcorpus(100, alphabet=letters[1:26], maxwordlen=6)
[1] "zlmxc ohe fy djxz qe jmkfzy uk ovaqw ouc lg hc rdecm ouefue whu vr spwgh pdv vysz it f qo votwt dkhud rraq vc jehrrj yyjlrv on vdffwi gbp uozt o zdxej vxaxm mkir tqhsw iehg mtq tu dgxtr kq p oz xyq ca jxunw zs cmqeqo mg r vkbawi wnza qj phout dnu fcm a qow g zhz dttrvz fi v a ito wah i reh x f jxc mhdme tr uus w iumoy hzi kz qabux zlppn genmyw r iqkw pd majp dtk hf tvfxs kym dgrq ytolxc fycxd ea vkpgg uaxj nb ckcn jnwz xspu oci"
}}
----
!以下古いメモ
{{pre
install.packages("ngram")
library(ngram)
> sample <- "I often hear that death penalty is reasonable for people whose family member or friend is killed by someone."
> get.ngrams(ngram(sample))
 [1] "death penalty"  "is reasonable"  "hear that"      "friend is"      "is killed"      "for people"     "whose family"  
 [8] "by someone."    "often hear"     "people whose"   "that death"     "killed by"      "I often"        "member or"     
[15] "or friend"      "family member"  "reasonable for" "penalty is"    
> 
> get.ngrams(ngram(preprocess(sample, remove.punct=TRUE)))
 [1] "death penalty"  "is reasonable"  "hear that"      "friend is"      "is killed"      "i often"        "for people"    
 [8] "whose family"   "often hear"     "people whose"   "by someone"     "that death"     "killed by"      "member or"     
[15] "or friend"      "family member"  "reasonable for" "penalty is"    
> print(ngram(preprocess(sample, remove.punct=TRUE)), output="full")
death penalty | 1 
is {1} | 

is reasonable | 1 
for {1} | 

hear that | 1 
death {1} | 

friend is | 1 
killed {1} | 

is killed | 1 
by {1} | 

i often | 1 
hear {1} | 

for people | 1 
whose {1} | 

whose family | 1 
member {1} | 

often hear | 1 
that {1} | 

people whose | 1 
family {1} | 

by someone | 1 
NULL {1} | 

that death | 1 
penalty {1} | 

killed by | 1 
someone {1} | 

member or | 1 
friend {1} | 

or friend | 1 
is {1} | 

family member | 1 
or {1} | 

reasonable for | 1 
people {1} | 

penalty is | 1 
reasonable {1} |