# Install quanteda and readtext only when they are not available yet, then
# attach them. Installing unconditionally would re-download both packages
# (and their dependencies) every time this script is run.
# The repository is addressed over https rather than plain http.
if (!requireNamespace("quanteda", quietly = TRUE)) {
  install.packages(
    "quanteda",
    dependencies = TRUE,
    repos = "https://cran.rstudio.com/"
  )
}
library(quanteda)
## Warning in stringi::stri_info(): Your native charset is not a superset of US-
## ASCII. This may cause serious problems. Consider switching to UTF-8.
## Warning in stringi::stri_info(): Your native charset is not a superset of US-
## ASCII. This may cause serious problems. Consider switching to UTF-8.
## Package version: 3.1.0
## Unicode version: 13.0
## ICU version: 69.1
## Parallel computing: 12 of 12 threads used.
## See https://quanteda.io for tutorials and examples.
if (!requireNamespace("readtext", quietly = TRUE)) {
  install.packages(
    "readtext",
    dependencies = TRUE,
    repos = "https://cran.rstudio.com/"
  )
}
library(readtext)
タグなどのついていない、テキストデータのみのファイルを、フォルダーにまとめて入れておく。
NICERの学習者データについて、テキストのみのファイルに変換する
https://sugiura-ken.org/wiki/wiki.cgi/exp?page=chatToText
# chatToText
# 2020-01-17 sugiura@nagoya-u.jp
#
# Extract the main-tier text from CHAT-format files.
#
# For every file in `dir` whose name ends in ".txt", keep only the lines that
# contain a main-tier speaker prefix ("*JPN" or "*NS", three more characters,
# a colon and a tab), remove that prefix, drop lines that are left empty, and
# write the remaining text to a file named after the input with ".data"
# appended.
#
# Because the output name no longer ends in ".txt", the generated files are
# not picked up as input when the function is run again.
#
# Arguments:
#   dir  Directory holding the CHAT files. Defaults to the current working
#        directory, which is the only location the function used before.
#
# Returns NULL invisibly; the function is called for its side effect of
# writing the ".data" files next to the inputs.
chatToText <- function(dir = getwd()) {
  speaker_prefix <- "\\*(JPN|NS)...:\t"

  # Only files whose names end in .txt are treated as input.
  chat_files <- list.files(dir, pattern = "\\.txt$", full.names = TRUE)

  for (chat_file in chat_files) {
    # Each file is read once, one element per line.
    chat_lines <- readLines(chat_file, warn = FALSE)

    main_tier <- grep(speaker_prefix, chat_lines, value = TRUE)
    body_text <- gsub(speaker_prefix, "", main_tier)
    body_text <- body_text[body_text != ""]

    # Original file name with ".data" appended.
    write(body_text, file = paste0(chat_file, ".data"))
  }

  invisible(NULL)
}
# Work inside the NICER learner-data directory for the import step.
# The directory is entered exactly once: the path is relative, so calling
# setwd() with it again in the same session fails once we are already inside.
nicer_dir <- "NICER1_3/NICER_NNS"
previous_wd <- setwd(nicer_dir)

# Write a .data text file for every CHAT .txt file in this directory.
chatToText()

# List the directory so the generated .data files can be checked.
list.files()

# NOTE(review): this silences every warning for the rest of the session, not
# only those raised while reading the files; consider scoping it more tightly.
options(warn = -1)

# Read all extracted text files.
nicerJP.data <- readtext("*.data")
nicerJP.data

# Return to the starting directory so that files written later do not end up
# among the input data.
setwd(previous_wd)

# Build the corpus from the texts that were read.
nicerJP.corpus <- corpus(nicerJP.data)
nicerJP.corpus
## Corpus consisting of 381 documents.
## JPN501.txt.data :
## "What kind of sports do you like? Do you like soccer, base ba..."
##
## JPN502.txt.data :
## "Education of "YUTORI" There was the education system that ca..."
##
## JPN503.txt.data :
## "educational policy What do you think about "yutori kyouiku"?..."
##
## JPN504.txt.data :
## "The impact of sports You often play sports. In elementary sc..."
##
## JPN505.txt.data :
## "About sports I want to talk about doing sports. To tell you ..."
##
## JPN506.txt.data :
## "Is money the most important thing? I often hear that which i..."
##
## [ reached max_ndoc ... 375 more documents ]
# Summarise the corpus.
summary(nicerJP.corpus)
*古いバージョンでは texts() だった
# Translation of the note above: older versions used texts() for this.
# Convert the corpus to a character vector and show its third element.
as.character(nicerJP.corpus)[3]
## JPN503.txt.data
## "educational policy\nWhat do you think about \"yutori kyouiku\"?\nIt is the educational policy started about a decade ago.\nBecause people had been educated based on learning as much knowledge as possible since WWII and they become to luck an ability of thinking by themselves, Japanese government started it to enable Japanese students to study objectively and acquire the ability to live by themselves.\nNowadays, they say this educational policy has much room to improve.\nIt is obvious that as they could learn less knowledge than before, they become not to be able to think and act objectively from the wide and deep prospect.\nSo I am for this critical argument generally.\nBut I think we should not see it as an entirely useless and incorrect policy.\nThen, what should we do?\nI think it is important to analyze what points lead to fail scientifically and acknowledge it precisely.\nFor example, Shortening school time is said to be bad for students because they are playing video games instead.\nIt is partially true, but when it comes to the children from upper society, actually they get more time to study effectively.\nUnless we argue this problem deeply, we will fail to again."
# Keyword-in-context concordance for "however".
# The corpus is tokenised explicitly: passing a corpus straight to kwic() is
# deprecated since quanteda 3.0.
kwic(tokens(nicerJP.corpus), "however")
行と列で、テキストごとに単語一覧を作成
どの文書にどの単語が入っているかを統一的に操作(分析)できる
対象は、コーパスデータ
オプション
*句読点の削除オプション: remove_punct = T
# Document-feature matrix of the corpus with punctuation removed and features
# stemmed. The steps are spelled out (tokens -> dfm -> dfm_wordstem) because
# calling dfm() on a corpus with the stem/remove_punct shortcuts is deprecated
# since quanteda 3.0.
nicerJP.dfm <- dfm_wordstem(
  dfm(tokens(nicerJP.corpus, remove_punct = TRUE))
)
nicerJP.dfm
## Document-feature matrix of: 381 documents, 3,181 features (96.26% sparse) and 0 docvars.
## features
## docs what kind of sport do you like soccer base ball
## JPN501.txt.data 1 1 3 6 2 19 2 2 1 1
## JPN502.txt.data 3 0 7 0 1 2 0 0 0 0
## JPN503.txt.data 3 0 1 0 2 1 0 0 1 0
## JPN504.txt.data 0 0 3 17 4 13 0 0 0 0
## JPN505.txt.data 0 0 8 17 13 5 2 1 0 0
## JPN506.txt.data 1 0 3 0 3 14 0 0 0 0
## [ reached max_ndoc ... 375 more documents, reached max_nfeat ... 3,171 more features ]
# Same matrix built step by step: tokenise without punctuation, count the
# features per document, then stem the features.
# TRUE is written out because T is an ordinary variable that can be reassigned.
nicerJP.dfm2 <- dfm_wordstem(
  dfm(
    tokens(nicerJP.corpus, remove_punct = TRUE)
  )
)
nicerJP.dfm2
## Document-feature matrix of: 381 documents, 3,181 features (96.26% sparse) and 0 docvars.
## features
## docs what kind of sport do you like soccer base ball
## JPN501.txt.data 1 1 3 6 2 19 2 2 1 1
## JPN502.txt.data 3 0 7 0 1 2 0 0 0 0
## JPN503.txt.data 3 0 1 0 2 1 0 0 1 0
## JPN504.txt.data 0 0 3 17 4 13 0 0 0 0
## JPN505.txt.data 0 0 8 17 13 5 2 1 0 0
## JPN506.txt.data 1 0 3 0 3 14 0 0 0 0
## [ reached max_ndoc ... 375 more documents, reached max_nfeat ... 3,171 more features ]
# summary() of either matrix reports only length, class and mode (see output).
summary(nicerJP.dfm)
## Length Class Mode
## 1211961 dfm S4
summary(nicerJP.dfm2)
## Length Class Mode
## 1211961 dfm S4
# Open the first matrix in the data viewer.
View(nicerJP.dfm)
# Most frequent features of each matrix; the recorded output is identical.
topfeatures(nicerJP.dfm)
## to i the and is in of a sport it
## 3635 3098 2997 2465 2428 2197 2012 1474 1379 1336
topfeatures(nicerJP.dfm2)
## to i the and is in of a sport it
## 3635 3098 2997 2465 2428 2197 2012 1474 1379 1336
# Install quanteda.textplots only when it is missing. The repository is given
# explicitly, as for the other packages in this file, so the call also works
# in a session where no CRAN mirror has been configured.
if (!requireNamespace("quanteda.textplots", quietly = TRUE)) {
  install.packages("quanteda.textplots", repos = "https://cran.rstudio.com/")
}
library(quanteda.textplots)

# Word clouds of the two stemmed matrices.
textplot_wordcloud(nicerJP.dfm)
textplot_wordcloud(nicerJP.dfm2)

# Third matrix: additionally drop English stopwords from the tokens before
# counting and stemming, then draw its word cloud.
nicerJP.dfm3 <- dfm_wordstem(
  dfm(
    tokens_remove(
      tokens(nicerJP.corpus, remove_punct = TRUE),
      stopwords("en")
    )
  )
)
textplot_wordcloud(nicerJP.dfm3)
# Dictionary grouping connectives into three categories.
connectives <- dictionary(list(
  additive = c("moreover", "further", "furthermore"),
  adversative = c("however", "nevertheless", "conversely"),
  resultative = c("therefore", "thus", "consequently")
))

# Count dictionary categories per document. The lookup is applied with
# dfm_lookup() on a tokens-based dfm because the `dictionary` argument of
# dfm() is deprecated since quanteda 3.0.
connectives.dfm <- dfm_lookup(
  dfm(tokens(nicerJP.corpus)),
  dictionary = connectives
)
View(connectives.dfm)
# Install quanteda.textstats only when it is missing. The repository is given
# explicitly, as for the other packages in this file, so the call also works
# in a session where no CRAN mirror has been configured.
if (!requireNamespace("quanteda.textstats", quietly = TRUE)) {
  install.packages("quanteda.textstats", repos = "https://cran.rstudio.com/")
}
library(quanteda.textstats)

# Collocations with the default settings.
textstat_collocations(nicerJP.corpus)
# Three-word collocations that occur at least 100 times.
textstat_collocations(nicerJP.corpus, size = 3, min_count = 100)
# Multi-word expressions to search for; phrase() marks them as word sequences
# rather than single patterns.
multiword <- c("in addition", "on the other hand", "as a result")
rengo <- phrase(multiword)

# Build the concordance once, show it, and save it. The corpus is tokenised
# explicitly because passing a corpus straight to kwic() is deprecated since
# quanteda 3.0.
rengo.df <- kwic(tokens(nicerJP.corpus), rengo)
rengo.df
write.table(rengo.df, "rengo.df.txt")