R
R
!!!quanteda: Quantitative Analysis of Textual Data
https://rdrr.io/cran/quanteda/

{{pre
install.packages("quanteda", dependencies=T)
library(quanteda)
}}

*コーパスを構築してコーパスデータとして扱う
**tmpにテキストファイルの内容（文字列）が入っているとして
 tmp.cor <- corpus(tmp)
*コーパスデータの概要を見る
 summary(tmp.cor)
*コーパスデータの中身を見る
 texts(tmp.cor)
*KWIC検索をする
 kwic(tmp.cor, pattern = "文字列")
*トークンのリスト作成
 tokens(tmp)
**オプション：数字を除去する
 remove_numbers=T
**オプション：句読点を除去する
 remove_punct=T


----
{{pre
> tmp.cor <- corpus(tmp)
> tmp.cor
Corpus consisting of 1 document and 0 docvars.
> summary(tmp.cor)
Corpus consisting of 1 document:

  Text Types Tokens Sentences
 text1   120    233         9

Source: C:/Users/ /Documents/* on x86-64 by sugiura
Created: Thu Nov 14 14:40:11 2019
Notes: 
> texts(tmp.cor)
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            text1 
"Computer Terminal Systems Inc said\nit has completed the sale of 200,000 shares of its 
（中略）
 generated labels, forms,\ntags and ticket printers and terminals.\n Reuter" 

> kwic(tmp.cor, pattern="for")
                                                                                   
  [text1, 39]             > of Lugano, Switzerland | for | 50,000 dlrs. The company
  [text1, 50]    said the warrants are exercisable | for | five years at a purchase
 [text1, 171]                     of Houston, Tex. | for | 200,000 dlrs. But,      
 [text1, 191] worldwide licensee of the technology | for | Woodco. The company said

> tokens(tmp)
tokens from 1 document.
text1 :
  [1] "Computer"       "Terminal"       "Systems"        "Inc"            "said"           "it"            
  [7] "has"            "completed"      "the"            "sale"           "of"             "200,000"       
 [13] "shares"         "of"             "its"            "common"         "stock"          ","             
 [19] "and"            "warrants"       "to"             "acquire"        "an"             "additional"   

> tokens(tmp, remove_numbers=T, remove_punct=T)
tokens from 1 document.
text1 :
  [1] "Computer"       "Terminal"       "Systems"        "Inc"            "said"           "it"            
  [7] "has"            "completed"      "the"            "sale"           "of"             "shares"        
 [13] "of"             "its"            "common"         "stock"          "and"            "warrants"      
 [19] "to"             "acquire"        "an"             "additional"     "one"            "mln"           
 [25] "shares"         "to"             "Sedio"          "N.V"            "of"             "Lugano"    

}}