# Install and attach the quanteda companion packages used below.
# NOTE(review): install.packages() in a script re-installs on every run;
# consider guarding each with a requireNamespace() check.
# Use TRUE, not T (T is an ordinary variable and can be reassigned).
install.packages("quanteda.textmodels", dependencies = TRUE)
install.packages("quanteda.textstats", dependencies = TRUE)
install.packages("quanteda.textplots", dependencies = TRUE)
library(quanteda.textmodels)
library(quanteda.textstats)
library(quanteda.textplots)
install.packages("readtext", dependencies = TRUE, repos = "http://cran.rstudio.com/")
# タグなどのついていない、テキストデータのみのファイルを、フォルダーにまとめて入れておく。
# (Put the plain-text files, with no tags, together in one folder.)
# NICERの学習者データについて、テキストのみのファイルに変換する
# (Convert the NICER learner data into text-only files.)
# https://sugiura-ken.org/wiki/wiki.cgi/exp?page=chatToText
# chatToText
# 2020-01-17 sugiura@nagoya-u.jp
#
# Extract only the main-tier text (speaker lines "*JPNxxx:" / "*NSxxx:")
# from CHAT-format files and save it under the original file name with
# ".data" appended.
#
# dir: folder to scan for *.txt files (default: current working directory,
#      which preserves the original zero-argument behavior). Because the
#      output names end in ".data", not ".txt", the results are never
#      re-read on a second run.
chatToText <- function(dir = getwd()) {
  # Process every .txt file in the directory.
  files <- list.files(dir, pattern = "\\.txt$", full.names = TRUE)
  for (f in files) {
    # readLines() (unlike scan(what = "char")) does not treat quote
    # characters specially, so utterances containing quotes survive intact.
    lines.tmp <- readLines(f, warn = FALSE)
    # Keep only main-tier lines such as "*JPN501:<TAB>..." or "*NS123:<TAB>...".
    data.tmp <- grep("\\*(JPN|NS)...:\t", lines.tmp, value = TRUE)
    # Strip the speaker tag, leaving just the utterance text.
    body.tmp <- gsub("\\*(JPN|NS)...:\t", "", data.tmp)
    body.tmp <- body.tmp[body.tmp != ""]
    # Append ".data" to the original file name and write one line per utterance.
    write(body.tmp, file = paste0(f, ".data"))
  }
}
# Convert the CHAT files in the NNS folder, then read the results back in.
# BUG FIX: the original called setwd("NICER1_3_2/NICER_NNS") three times with
# the same *relative* path; after the first call succeeds, the later calls
# fail (there is no nested NICER1_3_2 inside NICER_NNS). Change directory once.
setwd("NICER1_3_2/NICER_NNS")
chatToText()
list.files()
library(readtext)
# Suppress warnings globally (readtext warns about the unknown ".data"
# extension). NOTE(review): this silences *all* later warnings too.
options(warn = -1)
nicerJP.data <- readtext("*.data")
# Show 10 random documents. BUG FIX: the original used
# nicerJP.data %>% slice_sample(n = 10), but dplyr (which provides both
# %>% here and slice_sample) is never attached in this script; use base R.
nicerJP.data[sample(nrow(nicerJP.data), 10), ]
# Attach quanteda and build a corpus object from the readtext data frame.
library(quanteda)
## Package version: 3.2.0
## Unicode version: 13.0
## ICU version: 69.1
## Parallel computing: 12 of 12 threads used.
## See https://quanteda.io for tutorials and examples.
# corpus() keeps one document per input file (doc_id = source file name).
nicerJP.corpus <- corpus(nicerJP.data)
# Print a short preview of the corpus (first few documents).
nicerJP.corpus
## Corpus consisting of 381 documents.
## JPN501.txt.data :
## "What kind of sports do you like? Do you like soccer, base ba..."
##
## JPN502.txt.data :
## "Education of "YUTORI" There was the education system that ca..."
##
## JPN503.txt.data :
## "educational policy What do you think about "yutori kyouiku"?..."
##
## JPN504.txt.data :
## "The impact of sports You often play sports. In elementary sc..."
##
## JPN505.txt.data :
## "About sports I want to talk about doing sports. To tell you ..."
##
## JPN506.txt.data :
## "Is money the most important thing? I often hear that which i..."
##
## [ reached max_ndoc ... 375 more documents ]
# Per-document summary: type, token, and sentence counts.
summary(nicerJP.corpus)
## Corpus consisting of 381 documents, showing 100 documents:
## Text Types Tokens Sentences
## JPN501.txt.data 153 391 29
## JPN502.txt.data 171 411 27
## JPN503.txt.data 130 223 12
## JPN504.txt.data 149 306 26
## JPN505.txt.data 187 466 24
## JPN506.txt.data 134 292 19
## JPN507.txt.data 163 406 25
## JPN508.txt.data 110 224 17
## JPN509.txt.data 110 301 17
## JPN510.txt.data 108 198 13
# Inspect the raw text of the third document.
as.character(nicerJP.corpus)[3]
## JPN503.txt.data
## "educational policy\nWhat do you think about \"yutori kyouiku\"?\nIt is the educational policy started about a decade ago.\nBecause people had been educated based on learning as much knowledge as possible since WWII and they become to luck an ability of thinking by themselves, Japanese government started it to enable Japanese students to study objectively and acquire the ability to live by themselves.\nNowadays, they say this educational policy has much room to improve.\nIt is obvious that as they could learn less knowledge than before, they become not to be able to think and act objectively from the wide and deep prospect.\nSo I am for this critical argument generally.\nBut I think we should not see it as an entirely useless and incorrect policy.\nThen, what should we do?\nI think it is important to analyze what points lead to fail scientifically and acknowledge it precisely.\nFor example, Shortening school time is said to be bad for students because they are playing video games instead.\nIt is partially true, but when it comes to the children from upper society, actually they get more time to study effectively.\nUnless we argue this problem deeply, we will fail to again."
# Keyword-in-context: first concordance lines for "however".
kwic(nicerJP.corpus, "however") %>% head()
# 行と列で、テキストごとに単語一覧を作成
# (Build a document-feature matrix: one row per text, one column per word.)
# どの文書にどの単語が入っているかを統一的に操作(分析)できる
# (Lets you analyze which words occur in which documents in a uniform way.)
# 対象は、コーパスデータ
# (Input: the corpus data.)
# オプション
# (Options:)
# Tokenize, stem, and build the document-feature matrix.
# Equivalent nested form:
# nicerJP.dfm <- dfm_wordstem(dfm(tokens(nicerJP.corpus, remove_punct = T)))
nicerJP.dfm <- nicerJP.corpus %>%
  tokens(remove_punct = TRUE) %>% # drop punctuation tokens (TRUE, not T)
  tokens_wordstem() %>%           # Snowball stemming ("sports" -> "sport")
  # tokens_remove(stopwords("en")) %>%
  dfm()
nicerJP.dfm
## Document-feature matrix of: 381 documents, 3,224 features (96.30% sparse) and 0 docvars.
## features
## docs what kind of sport do you like soccer base ball
## JPN501.txt.data 1 1 3 6 2 19 2 2 1 1
## JPN502.txt.data 3 0 7 0 1 2 0 0 0 0
## JPN503.txt.data 3 0 1 0 2 1 0 0 1 0
## JPN504.txt.data 0 0 3 17 4 13 0 0 0 0
## JPN505.txt.data 0 0 8 17 13 5 2 1 0 0
## JPN506.txt.data 1 0 3 0 3 14 0 0 0 0
## [ reached max_ndoc ... 375 more documents, reached max_nfeat ... 3,214 more features ]
# summary() on a dfm (an S4 object) only reports its length/class, not counts.
summary(nicerJP.dfm)
## Length Class Mode
## 1228344 dfm S4
#View(nicerJP.dfm)
# Ten most frequent (stemmed) features across the corpus.
topfeatures(nicerJP.dfm)
## to i the and is in of a sport it
## 3635 3098 2996 2464 2428 2197 2011 1474 1377 1332
library(quanteda.textplots)
# Word cloud of all features (dominated by function words like "to", "the").
textplot_wordcloud(nicerJP.dfm)
# Same pipeline, but remove English stopwords and keep only features that
# occur at least 100 times in the whole corpus.
nicerJP100.dfm <- nicerJP.corpus %>%
  tokens(remove_punct = TRUE) %>%      # TRUE, not T
  tokens_wordstem() %>%
  tokens_remove(stopwords("en")) %>%   # remove stopwords here
  dfm() %>%
  dfm_trim(min_termfreq = 100)         # minimum total frequency of 100
nicerJP100.dfm
## Document-feature matrix of: 381 documents, 105 features (68.67% sparse) and 0 docvars.
## features
## docs kind sport like soccer mani world countri japan play import
## JPN501.txt.data 1 6 2 2 6 3 1 1 8 1
## JPN502.txt.data 0 0 0 0 1 1 0 2 2 1
## JPN503.txt.data 0 0 0 0 0 0 0 0 1 1
## JPN504.txt.data 0 17 0 0 3 0 0 0 8 1
## JPN505.txt.data 0 17 2 1 0 0 0 0 5 2
## JPN506.txt.data 0 0 0 0 0 0 1 0 0 6
## [ reached max_ndoc ... 375 more documents, reached max_nfeat ... 95 more features ]
# Word cloud of the trimmed, stopword-free matrix (content words only).
textplot_wordcloud(nicerJP100.dfm)
# Dictionary of logical connectives grouped by discourse function.
connectives <- dictionary(list(additive = c("moreover", "further", "furthermore"),
                               adversative = c("however", "nevertheless", "conversely"),
                               resultative = c("therefore", "thus", "consequently")))
# BUG FIX: nicerJP.dfm was built from *stemmed* tokens ("however" -> "howev",
# "therefore" -> "therefor", "furthermore" -> "furthermor"), so looking up
# these unstemmed dictionary terms there can never match most of them —
# which is why the original output below is all zeros. Apply the dictionary
# to an unstemmed dfm instead.
dfm_lookup(dfm(tokens(nicerJP.corpus, remove_punct = TRUE)),
           dictionary = connectives)
## Document-feature matrix of: 381 documents, 3 features (98.16% sparse) and 0 docvars.
## features
## docs additive adversative resultative
## JPN501.txt.data 0 0 0
## JPN502.txt.data 0 0 0
## JPN503.txt.data 0 0 0
## JPN504.txt.data 0 0 0
## JPN505.txt.data 0 0 0
## JPN506.txt.data 0 0 0
## [ reached max_ndoc ... 375 more documents ]
# Dictionary of personal pronouns grouped by grammatical person.
pronouns <- dictionary(list(first = c("I", "my", "me", "mine"),
                            second = c("you", "your", "yours"),
                            third = c("he", "his", "him", "she", "her", "hers")))
# BUG FIX: stemming alters some of these forms (e.g. "hers" -> "her",
# "yours" -> "your"), so counts from the stemmed nicerJP.dfm are unreliable;
# look the terms up in an unstemmed dfm for accurate per-form counts.
dfm_lookup(dfm(tokens(nicerJP.corpus, remove_punct = TRUE)),
           dictionary = pronouns)
## Document-feature matrix of: 381 documents, 3 features (45.06% sparse) and 0 docvars.
## features
## docs first second third
## JPN501.txt.data 0 21 1
## JPN502.txt.data 10 3 0
## JPN503.txt.data 3 1 0
## JPN504.txt.data 2 24 0
## JPN505.txt.data 21 6 0
## JPN506.txt.data 3 15 5
## [ reached max_ndoc ... 375 more documents ]
# Most significant two-word collocations (default size = 2).
textstat_collocations(nicerJP.corpus) %>% head()
library(quanteda.textstats) # already attached at the top; harmless repeat
# Three-word collocations occurring at least 100 times.
textstat_collocations(nicerJP.corpus, size = 3, min_count = 100)
# Search for fixed multi-word expressions.
multiword <- c("in addition", "on the other hand", "as a result")
rengo <- phrase(multiword) # treat each string as a multi-token phrase
kwic(nicerJP.corpus, rengo)
rengo.df <- kwic(nicerJP.corpus, rengo)
write.table(rengo.df, "rengo.df.txt") # save the concordance as a text file