# spacyr: an R package for basic natural language processing (parsing) via spaCy.
# spaCy is built on Python; Cython extends Python with C for speed.
# The name "spaCy" is a blend of "space" and "Cython".
# One-time setup: install spacyr (the R wrapper around the spaCy NLP library).
install.packages("spacyr")
library(spacyr)
# Installs the Python spaCy backend (one-time setup step).
spacy_install()
# Start the spaCy engine with the small English pipeline.
spacy_initialize(model = "en_core_web_sm")
## successfully initialized (spaCy Version: 3.7.2, language model: en_core_web_sm)
# Read the sample passage; readLines() returns one character string per line.
sample.dat <- readLines("samplepassage.txt")
head(sample.dat)
## [1] "Sleep is an essential part of our daily routine, and it is crucial for our physical and mental well-being. It is a time when our body and mind can rest and recharge, allowing us to wake up feeling refreshed and ready to tackle the day ahead. However, despite its importance, many people do not get enough sleep, which can lead to a range of negative consequences."
## [2] "One of the most significant benefits of sleep is that it helps to improve our physical health. During sleep, our body repairs and regenerates tissues, strengthens our immune system, and helps to regulate our hormones. This means that getting enough sleep can help to reduce the risk of developing chronic health conditions such as heart disease, diabetes, and obesity."
## [3] "In addition to its physical benefits, sleep is also essential for our mental health. Lack of sleep can lead to mood swings, irritability, and difficulty concentrating, which can have a significant impact on our daily lives. It can also increase the risk of developing mental health conditions such as depression and anxiety."
## [4] "Despite the importance of sleep, many people struggle to get enough of it. There are several reasons for this, including stress, anxiety, and poor sleep habits. However, there are several things that we can do to improve our sleep quality. For example, establishing a regular sleep schedule, avoiding caffeine and alcohol before bedtime, and creating a relaxing sleep environment can all help to promote better sleep."
## [5] "In conclusion, sleep is an essential part of our daily routine, and it is crucial for our physical and mental well-being. Getting enough sleep can help to improve our physical health, reduce the risk of developing chronic health conditions, and promote better mental health. Therefore, it is important to prioritize sleep and take steps to ensure that we are getting enough of it. By doing so, we can wake up feeling refreshed and ready to tackle the day ahead."
# Re-attaching / re-initializing is harmless: spacyr detects a running session.
library(spacyr)
spacy_initialize(model = "en_core_web_sm")
## spaCy is already initialized
## NULL
# Tokenize: returns a named list (text1, text2, ...) of token character vectors.
sample.dat.token <- spacy_tokenize(sample.dat)
head(sample.dat.token)
## $text1
## [1] "Sleep" "is" "an" "essential" "part"
## [6] "of" "our" "daily" "routine" ","
## [11] "and" "it" "is" "crucial" "for"
## [16] "our" "physical" "and" "mental" "well"
## [21] "-" "being" "." "It" "is"
## [26] "a" "time" "when" "our" "body"
## [31] "and" "mind" "can" "rest" "and"
## [36] "recharge" "," "allowing" "us" "to"
## [41] "wake" "up" "feeling" "refreshed" "and"
## [46] "ready" "to" "tackle" "the" "day"
## [51] "ahead" "." "However" "," "despite"
## [56] "its" "importance" "," "many" "people"
## [61] "do" "not" "get" "enough" "sleep"
## [66] "," "which" "can" "lead" "to"
## [71] "a" "range" "of" "negative" "consequences"
## [76] "."
##
## $text2
## [1] "One" "of" "the" "most" "significant"
## [6] "benefits" "of" "sleep" "is" "that"
## [11] "it" "helps" "to" "improve" "our"
## [16] "physical" "health" "." "During" "sleep"
## [21] "," "our" "body" "repairs" "and"
## [26] "regenerates" "tissues" "," "strengthens" "our"
## [31] "immune" "system" "," "and" "helps"
## [36] "to" "regulate" "our" "hormones" "."
## [41] "This" "means" "that" "getting" "enough"
## [46] "sleep" "can" "help" "to" "reduce"
## [51] "the" "risk" "of" "developing" "chronic"
## [56] "health" "conditions" "such" "as" "heart"
## [61] "disease" "," "diabetes" "," "and"
## [66] "obesity" "."
##
## $text3
## [1] "In" "addition" "to" "its"
## [5] "physical" "benefits" "," "sleep"
## [9] "is" "also" "essential" "for"
## [13] "our" "mental" "health" "."
## [17] "Lack" "of" "sleep" "can"
## [21] "lead" "to" "mood" "swings"
## [25] "," "irritability" "," "and"
## [29] "difficulty" "concentrating" "," "which"
## [33] "can" "have" "a" "significant"
## [37] "impact" "on" "our" "daily"
## [41] "lives" "." "It" "can"
## [45] "also" "increase" "the" "risk"
## [49] "of" "developing" "mental" "health"
## [53] "conditions" "such" "as" "depression"
## [57] "and" "anxiety" "."
##
## $text4
## [1] "Despite" "the" "importance" "of" "sleep"
## [6] "," "many" "people" "struggle" "to"
## [11] "get" "enough" "of" "it" "."
## [16] "There" "are" "several" "reasons" "for"
## [21] "this" "," "including" "stress" ","
## [26] "anxiety" "," "and" "poor" "sleep"
## [31] "habits" "." "However" "," "there"
## [36] "are" "several" "things" "that" "we"
## [41] "can" "do" "to" "improve" "our"
## [46] "sleep" "quality" "." "For" "example"
## [51] "," "establishing" "a" "regular" "sleep"
## [56] "schedule" "," "avoiding" "caffeine" "and"
## [61] "alcohol" "before" "bedtime" "," "and"
## [66] "creating" "a" "relaxing" "sleep" "environment"
## [71] "can" "all" "help" "to" "promote"
## [76] "better" "sleep" "."
##
## $text5
## [1] "In" "conclusion" "," "sleep" "is"
## [6] "an" "essential" "part" "of" "our"
## [11] "daily" "routine" "," "and" "it"
## [16] "is" "crucial" "for" "our" "physical"
## [21] "and" "mental" "well" "-" "being"
## [26] "." "Getting" "enough" "sleep" "can"
## [31] "help" "to" "improve" "our" "physical"
## [36] "health" "," "reduce" "the" "risk"
## [41] "of" "developing" "chronic" "health" "conditions"
## [46] "," "and" "promote" "better" "mental"
## [51] "health" "." "Therefore" "," "it"
## [56] "is" "important" "to" "prioritize" "sleep"
## [61] "and" "take" "steps" "to" "ensure"
## [66] "that" "we" "are" "getting" "enough"
## [71] "of" "it" "." "By" "doing"
## [76] "so" "," "we" "can" "wake"
## [81] "up" "feeling" "refreshed" "and" "ready"
## [86] "to" "tackle" "the" "day" "ahead"
## [91] "."
# Tokenize again, dropping punctuation and returning a long-format data frame
# (one row per token, keyed by doc_id).
sample.dat.token <- spacy_tokenize(sample.dat, remove_punct = TRUE, output = "data.frame")
head(sample.dat.token)
## doc_id token
## 1 text1 Sleep
## 2 text1 is
## 3 text1 an
## 4 text1 essential
## 5 text1 part
## 6 text1 of
# Full parse with default options: lemma, coarse POS, and named entities.
sample.dat.parsed <- spacy_parse(sample.dat)
head(sample.dat.parsed)
## doc_id sentence_id token_id token lemma pos entity
## 1 text1 1 1 Sleep sleep NOUN
## 2 text1 1 2 is be AUX
## 3 text1 1 3 an an DET
## 4 text1 1 4 essential essential ADJ
## 5 text1 1 5 part part NOUN
## 6 text1 1 6 of of ADP
str(sample.dat.parsed)
## Classes 'spacyr_parsed' and 'data.frame': 371 obs. of 7 variables:
## $ doc_id : chr "text1" "text1" "text1" "text1" ...
## $ sentence_id: int 1 1 1 1 1 1 1 1 1 1 ...
## $ token_id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ token : chr "Sleep" "is" "an" "essential" ...
## $ lemma : chr "sleep" "be" "an" "essential" ...
## $ pos : chr "NOUN" "AUX" "DET" "ADJ" ...
## $ entity : chr "" "" "" "" ...
# Default arguments of spacy_parse() (for reference):
#   pos = TRUE,
#   tag = FALSE,
#   lemma = TRUE,
#   entity = TRUE,
#   dependency = FALSE,
#   nounphrase = FALSE,
#   multithread = TRUE,
# Re-parse, additionally requesting fine-grained tags (tag), dependency
# relations (dependency), and noun-phrase annotations (nounphrase).
# Use TRUE rather than T: T and F are ordinary variables and can be reassigned.
sample.dat.parsed <- spacy_parse(sample.dat, tag = TRUE, dependency = TRUE, nounphrase = TRUE)
head(sample.dat.parsed)
## doc_id sentence_id token_id token lemma pos tag head_token_id
## 1 text1 1 1 Sleep sleep NOUN NN 2
## 2 text1 1 2 is be AUX VBZ 2
## 3 text1 1 3 an an DET DT 5
## 4 text1 1 4 essential essential ADJ JJ 5
## 5 text1 1 5 part part NOUN NN 2
## 6 text1 1 6 of of ADP IN 5
## dep_rel entity nounphrase whitespace
## 1 nsubj beg_root TRUE
## 2 ROOT TRUE
## 3 det beg TRUE
## 4 amod mid TRUE
## 5 attr end_root TRUE
## 6 prep TRUE
nounphrase_extract(sample.dat.parsed)
## doc_id sentence_id nounphrase
## 1 text1 1 Sleep
## 2 text1 1 an_essential_part
## 3 text1 1 our_daily_routine
## 4 text1 1 it
## 5 text1 1 our_physical_and_mental_well-being
## 6 text1 2 It
## 7 text1 2 a_time
## 8 text1 2 our_body
## 9 text1 2 mind
## 10 text1 2 us
## 11 text1 2 feeling
## 12 text1 2 the_day
## 13 text1 3 its_importance
## 14 text1 3 many_people
## 15 text1 3 enough_sleep
## 16 text1 3 which
## 17 text1 3 a_range
## 18 text1 3 negative_consequences
## 19 text2 1 the_most_significant_benefits
## 20 text2 1 sleep
## 21 text2 1 it
## 22 text2 1 our_physical_health
## 23 text2 2 sleep
## 24 text2 2 our_body_repairs_and_regenerates_tissues
## 25 text2 2 our_immune_system
## 26 text2 2 our_hormones
## 27 text2 3 This
## 28 text2 3 enough_sleep
## 29 text2 3 the_risk
## 30 text2 3 chronic_health_conditions
## 31 text2 3 heart_disease
## 32 text2 3 obesity
## 33 text3 1 addition
## 34 text3 1 its_physical_benefits
## 35 text3 1 sleep
## 36 text3 1 our_mental_health
## 37 text3 2 Lack
## 38 text3 2 sleep
## 39 text3 2 mood_swings
## 40 text3 2 irritability
## 41 text3 2 difficulty
## 42 text3 2 which
## 43 text3 2 a_significant_impact
## 44 text3 2 our_daily_lives
## 45 text3 3 It
## 46 text3 3 the_risk
## 47 text3 3 mental_health_conditions
## 48 text3 3 depression
## 49 text3 3 anxiety
## 50 text4 1 the_importance
## 51 text4 1 sleep
## 52 text4 1 many_people
## 53 text4 1 it
## 54 text4 2 several_reasons
## 55 text4 2 this
## 56 text4 2 stress
## 57 text4 2 anxiety
## 58 text4 2 poor_sleep_habits
## 59 text4 3 several_things
## 60 text4 3 that
## 61 text4 3 we
## 62 text4 3 our_sleep_quality
## 63 text4 4 example
## 64 text4 4 a_regular_sleep_schedule
## 65 text4 4 caffeine
## 66 text4 4 alcohol
## 67 text4 4 bedtime
## 68 text4 4 a_relaxing_sleep_environment
## 69 text4 4 better_sleep
## 70 text5 1 conclusion
## 71 text5 1 sleep
## 72 text5 1 an_essential_part
## 73 text5 1 our_daily_routine
## 74 text5 1 it
## 75 text5 1 our_physical_and_mental_well-being
## 76 text5 2 enough_sleep
## 77 text5 2 our_physical_health
## 78 text5 2 the_risk
## 79 text5 2 chronic_health_conditions
## 80 text5 2 better_mental_health
## 81 text5 3 it
## 82 text5 3 sleep
## 83 text5 3 steps
## 84 text5 3 we
## 85 text5 3 it
## 86 text5 4 we
## 87 text5 4 the_day
library(dplyr)
##
## 次のパッケージを付け加えます: 'dplyr'
## 以下のオブジェクトは 'package:stats' からマスクされています:
##
## filter, lag
## 以下のオブジェクトは 'package:base' からマスクされています:
##
## intersect, setdiff, setequal, union
nounphrase_consolidate(sample.dat.parsed) %>% head()
## Note: removing head_token_id, dep_rel for nounphrases
## doc_id sentence_id token_id token lemma pos
## 1 text1 1 1 Sleep sleep nounphrase
## 2 text1 1 2 is be AUX
## 3 text1 1 3 an_essential_part an_essential_part nounphrase
## 4 text1 1 4 of of ADP
## 5 text1 1 5 our_daily_routine our_daily_routine nounphrase
## 6 text1 1 6 , , PUNCT
## tag
## 1 nounphrase
## 2 VBZ
## 3 nounphrase
## 4 IN
## 5 nounphrase
## 6 ,
spacy_extract_nounphrases(sample.dat) %>% head()
## doc_id text root_text start_id root_id length
## 1 text1 Sleep Sleep 1 1 1
## 2 text1 an essential part part 3 5 3
## 3 text1 our daily routine routine 7 9 3
## 4 text1 it it 12 12 1
## 5 text1 our physical and mental well-being being 16 22 7
## 6 text1 It It 24 24 1
# textplot visualizes NLP annotations (e.g. dependency trees).
# Use TRUE, not the reassignable shorthand T, for dependencies.
install.packages("textplot", dependencies = TRUE)
library(textplot)
# The parse now has 12 columns (tag, head_token_id, dep_rel, nounphrase, ...).
str(sample.dat.parsed)
## Classes 'spacyr_parsed' and 'data.frame': 371 obs. of 12 variables:
## $ doc_id : chr "text1" "text1" "text1" "text1" ...
## $ sentence_id : int 1 1 1 1 1 1 1 1 1 1 ...
## $ token_id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ token : chr "Sleep" "is" "an" "essential" ...
## $ lemma : chr "sleep" "be" "an" "essential" ...
## $ pos : chr "NOUN" "AUX" "DET" "ADJ" ...
## $ tag : chr "NN" "VBZ" "DT" "JJ" ...
## $ head_token_id: num 2 2 5 5 2 5 9 9 6 2 ...
## $ dep_rel : chr "nsubj" "ROOT" "det" "amod" ...
## $ entity : chr "" "" "" "" ...
## $ nounphrase : chr "beg_root" "" "beg" "mid" ...
## $ whitespace : logi TRUE TRUE TRUE TRUE TRUE TRUE ...
# Plot the dependency tree for the first sentence of text2.
# textplot_dependencyparser() expects a column named "upos", so copy pos.
library(textplot)
library(igraph)
##
## 次のパッケージを付け加えます: 'igraph'
## 以下のオブジェクトは 'package:dplyr' からマスクされています:
##
## as_data_frame, groups, union
## 以下のオブジェクトは 'package:stats' からマスクされています:
##
## decompose, spectrum
## 以下のオブジェクトは 'package:base' からマスクされています:
##
## union
library(dplyr)
sample.dat.parsed %>%
  mutate(upos = pos) %>%
  filter(doc_id == "text2", sentence_id == 1) %>%
  textplot_dependencyparser()
## 要求されたパッケージ ggraph をロード中です
# quanteda: "An R package for the Quantitative Analysis of Textual Data"
# Install and attach quanteda, the main corpus-analysis toolkit used below.
install.packages("quanteda")
library(quanteda)
## Package version: 3.3.1
## Unicode version: 13.0
## ICU version: 69.1
## Parallel computing: 16 of 16 threads used.
## See https://quanteda.io for tutorials and examples.
# readtext reads many text-file formats into a data frame usable by quanteda.
# Use TRUE, not the reassignable shorthand T, for dependencies.
install.packages("readtext", dependencies = TRUE)
library(readtext)
##
## 次のパッケージを付け加えます: 'readtext'
## 以下のオブジェクトは 'package:quanteda' からマスクされています:
##
## texts
# NOTE(review): setwd() in a script is fragile — everything after this point
# depends on the starting working directory. Consider passing the path
# directly, e.g. readtext("NICEST/JAN_plain/*.txt"), instead of changing wd.
setwd("NICEST/JAN_plain/")
# Read every .txt essay in the folder and build a quanteda corpus from it.
nicestJP.dat <- readtext("*.txt")
nicestJP.corpus <- quanteda::corpus(nicestJP.dat)
# Per-document summary: type, token, and sentence counts.
summary(nicestJP.corpus) %>% head()
## Text Types Tokens Sentences
## 1 JAN0001_P1B.txt 116 214 12
## 2 JAN0001_P2B.txt 138 268 17
## 3 JAN0001_P3B.txt 97 169 11
## 4 JAN0001_P4B.txt 68 99 8
## 5 JAN0001_P5B.txt 120 262 16
## 6 JAN0001_P6B.txt 114 224 13
kwic(nicestJP.corpus, pattern="however") %>% head()
## Warning: 'kwic.corpus()' is deprecated. Use 'tokens()' first.
## Keyword-in-context with 6 matches.
## [JAN0001_P1B.txt, 13] not important for human, | however |
## [JAN0001_P2B.txt, 31] compared with young people, | however |
## [JAN0001_P7B.txt, 71] misunderstanding about that area. | However |
## [JAN0002_P2A.txt, 100] . It has protected them | however |
## [JAN0002_P5A.txt, 137] limited in our earth. | However |
## [JAN0002_P8A.txt, 71] unfortunately stuation or saddness. | However |
##
## , who make todays life
## , I recognize that is
## , if you only know
## we must live alone.
## a train is costed low
## , these experience make people
tokens(nicestJP.corpus)
## Tokens consisting of 1,836 documents.
## JAN0001_P1B.txt :
## [1] "Some" "people" "say" "that" "specialized"
## [6] "knowledge" "is" "not" "important" "for"
## [11] "human" ","
## [ ... and 202 more ]
##
## JAN0001_P2B.txt :
## [1] "You" "may" "think" "that" "young" "people" "are" "active"
## [9] "and" "free" "," "on"
## [ ... and 256 more ]
##
## JAN0001_P3B.txt :
## [1] "Compared" "with" "past" "," "young" "people"
## [7] "nowadays" "do" "not" "give" "enough" "time"
## [ ... and 157 more ]
##
## JAN0001_P4B.txt :
## [1] "You" "may" "have" "experiences" "like"
## [6] "this" "," "feel" "nice" "at"
## [11] "products" "in"
## [ ... and 87 more ]
##
## JAN0001_P5B.txt :
## [1] "Elderly" "person" "often" "says" "that" "young" "men"
## [8] "does" "not" "seem" "to" "want"
## [ ... and 250 more ]
##
## JAN0001_P6B.txt :
## [1] "Group" "tourisms" "are" "easy" "to" "attend"
## [7] "so" "many" "people" "use" "them" "to"
## [ ... and 212 more ]
##
## [ reached max_ndoc ... 1,830 more documents ]
tokens(nicestJP.corpus, what="sentence")
## Tokens consisting of 1,836 documents.
## JAN0001_P1B.txt :
## [1] "Some people say that specialized knowledge is not important for human, however, who make todays life such a convenience are always a few number of genius with very specific knowledges."
## [2] "To consider this, it can be said that to specialized in one specific subject is better than to get bload knowledge of many academic subjects."
## [3] "There is some more reasons and examples which supports my opinion."
## [4] "First of all, I want you to know that the knowledge does not specialized do nothing."
## [5] "Even if you are walking dictionary, it does not produce anything."
## [6] "Only you have special skills, you can get something by using it."
## [7] "So if you want to earn money or do something big, you should to get specialized knowledges."
## [8] "Next, I think the later is better because bload knowledge is not so rare."
## [9] "When you do not know about such a common thing, you only have to use dictionaries to understand."
## [10] "You need not to become walking dictionary because you can use dictionaries."
## [11] "To sum up, for its usefulness you ought to become specialized in one subject."
## [12] "It must help your life much more than bload knowledge will do."
##
## JAN0001_P2B.txt :
## [1] "You may think that young people are active and free, on the other hand olders are less active and they have much little freedom compared with young people, however, I recognize that is incorrect."
## [2] "So I disagree with above sentences."
## [3] "I will explain why I think this way with following three reasons."
## [4] "First of all, I want you know that young person can not have enough time while old person can."
## [5] "For their works or homeworks, young people have less free time."
## [6] "So sometimes they can not do what they like."
## [7] "Not only young person can not have enough free time, they are suffered from fear of future."
## [8] "No young person can avoid to worry about their future."
## [9] "It is so stressful."
## [10] "But in case of old person, they need not to worry like this."
## [11] "My grandfather always says that he can not enjoy his young age because of them, but now, he enjoys his old life without being disappointed at his future."
## [12] "This is second reason."
## [ ... and 5 more ]
##
## JAN0001_P3B.txt :
## [1] "Compared with past, young people nowadays do not give enough time to helping their communities."
## [2] "I guess there is some reasons causes this situation."
## [3] "First of all, youngman is more busy to do homework or finding jobs than parents was."
## [4] "Due to it, sutudents and new saralyman can not have enough time."
## [5] "Next, now, ecconomical situation in Japan is bad and this gave young person big pressure."
## [6] "The fear make them be too tired to help families."
## [7] "Third reason is change of the thinking among the people."
## [8] "In pastdays, communities had much bigger power than today."
## [9] "So it was natural for people to help it even if they had to do much effort."
## [10] "But now, young person do not think such a thing."
## [11] "To sum up, because of their busyness, fear, and change in the thinking way make them do not give enough time to helping their communities."
##
## JAN0001_P4B.txt :
## [1] "You may have experiences like this, feel nice at products in some advertisement but you buy and see it, you disappointed."
## [2] "What causes this?"
## [3] "I think it is because producer want to earn money even if they deceive consumers."
## [4] "In advertisements, to make products good then real is easy."
## [5] "First, as most people, producers want money."
## [6] "And to earn money by salling, showing their products as great as they can is important things."
## [7] "So they tell lie us."
## [8] "Using movie or picture, they deceive."
##
## JAN0001_P5B.txt :
## [1] "Elderly person often says that young men does not seem to want to have the car today."
## [2] "And I think so too."
## [3] "So I agree the following statement."
## [4] "From now, I will explain what will decrease the number of cars in future by some reasons."
## [5] "First of all, recent young people have been taught that cars can be cause of the green house effect by producing some gas."
## [6] "Pastdays, human did not know that and buy cars."
## [7] "But now and in future, for the fear of this enviromental crisis, people will avoid buying cars as possible."
## [8] "I think this is the biggest reason."
## [9] "In addition above reason, people will notice that they can move faster than cars by using trains."
## [10] "Normally, cars is not permitted to run faster than trains."
## [11] "In the view of like this, people in twenty years later seem to use trains than cars."
## [12] "Third, in the view of safety, cars are dangarous."
## [ ... and 4 more ]
##
## JAN0001_P6B.txt :
## [1] "Group tourisms are easy to attend so many people use them to travel."
## [2] "But to think of it, group travels is not the best way of traveling."
## [3] "There are three reasons supports this opinion in this sentence."
## [4] "First, when you participate in a tour with guide, it costs higher."
## [5] "If you travel by yourself without any guides, you can discount your travering cost."
## [6] "What is more, for the discounting, you not only save money, but also you can enjoy next travering earlier."
## [7] "Second negative effects of a tour guide is freedom."
## [8] "In group, you may not be able to go to where you realy want to go."
## [9] "You should go somewhere you really wan na go in your travel if you are to make that trip the best."
## [10] "Finally, if you join some group of tour by calling or mailing and travel in the same way with other guests, your trip is not so much interesting."
## [11] "You should think about where you going to go before you really travel and make your traveling special to enjoy your journy."
## [12] "It is the best way of traveling."
## [ ... and 1 more ]
##
## [ reached max_ndoc ... 1,830 more documents ]
tokens(nicestJP.corpus, what="word", remove_numbers=T, remove_punct=T)
## Tokens consisting of 1,836 documents.
## JAN0001_P1B.txt :
## [1] "Some" "people" "say" "that" "specialized"
## [6] "knowledge" "is" "not" "important" "for"
## [11] "human" "however"
## [ ... and 180 more ]
##
## JAN0001_P2B.txt :
## [1] "You" "may" "think" "that" "young" "people" "are" "active"
## [9] "and" "free" "on" "the"
## [ ... and 225 more ]
##
## JAN0001_P3B.txt :
## [1] "Compared" "with" "past" "young" "people" "nowadays"
## [7] "do" "not" "give" "enough" "time" "to"
## [ ... and 136 more ]
##
## JAN0001_P4B.txt :
## [1] "You" "may" "have" "experiences" "like"
## [6] "this" "feel" "nice" "at" "products"
## [11] "in" "some"
## [ ... and 72 more ]
##
## JAN0001_P5B.txt :
## [1] "Elderly" "person" "often" "says" "that" "young" "men"
## [8] "does" "not" "seem" "to" "want"
## [ ... and 217 more ]
##
## JAN0001_P6B.txt :
## [1] "Group" "tourisms" "are" "easy" "to" "attend"
## [7] "so" "many" "people" "use" "them" "to"
## [ ... and 188 more ]
##
## [ reached max_ndoc ... 1,830 more documents ]
tokens(nicestJP.corpus, what="word", remove_numbers=T, remove_punct=T) %>% tokens_tolower()
## Tokens consisting of 1,836 documents.
## JAN0001_P1B.txt :
## [1] "some" "people" "say" "that" "specialized"
## [6] "knowledge" "is" "not" "important" "for"
## [11] "human" "however"
## [ ... and 180 more ]
##
## JAN0001_P2B.txt :
## [1] "you" "may" "think" "that" "young" "people" "are" "active"
## [9] "and" "free" "on" "the"
## [ ... and 225 more ]
##
## JAN0001_P3B.txt :
## [1] "compared" "with" "past" "young" "people" "nowadays"
## [7] "do" "not" "give" "enough" "time" "to"
## [ ... and 136 more ]
##
## JAN0001_P4B.txt :
## [1] "you" "may" "have" "experiences" "like"
## [6] "this" "feel" "nice" "at" "products"
## [11] "in" "some"
## [ ... and 72 more ]
##
## JAN0001_P5B.txt :
## [1] "elderly" "person" "often" "says" "that" "young" "men"
## [8] "does" "not" "seem" "to" "want"
## [ ... and 217 more ]
##
## JAN0001_P6B.txt :
## [1] "group" "tourisms" "are" "easy" "to" "attend"
## [7] "so" "many" "people" "use" "them" "to"
## [ ... and 188 more ]
##
## [ reached max_ndoc ... 1,830 more documents ]
tokens(nicestJP.corpus, what="word", remove_numbers=T, remove_punct=T) %>% tokens_tolower() %>% tokens_wordstem()
## Tokens consisting of 1,836 documents.
## JAN0001_P1B.txt :
## [1] "some" "peopl" "say" "that" "special" "knowledg"
## [7] "is" "not" "import" "for" "human" "howev"
## [ ... and 180 more ]
##
## JAN0001_P2B.txt :
## [1] "you" "may" "think" "that" "young" "peopl" "are" "activ" "and"
## [10] "free" "on" "the"
## [ ... and 225 more ]
##
## JAN0001_P3B.txt :
## [1] "compar" "with" "past" "young" "peopl" "nowaday" "do"
## [8] "not" "give" "enough" "time" "to"
## [ ... and 136 more ]
##
## JAN0001_P4B.txt :
## [1] "you" "may" "have" "experi" "like" "this" "feel"
## [8] "nice" "at" "product" "in" "some"
## [ ... and 72 more ]
##
## JAN0001_P5B.txt :
## [1] "elder" "person" "often" "say" "that" "young" "men" "doe"
## [9] "not" "seem" "to" "want"
## [ ... and 217 more ]
##
## JAN0001_P6B.txt :
## [1] "group" "tourism" "are" "easi" "to" "attend" "so"
## [8] "mani" "peopl" "use" "them" "to"
## [ ... and 188 more ]
##
## [ reached max_ndoc ... 1,830 more documents ]
# Options: remove punctuation with remove_punct = TRUE.
# Signature of tokens() (for reference only, not runnable):
# tokens(
#   x,
#   what = "word",
#   remove_punct = FALSE,
#   remove_symbols = FALSE,
#   remove_numbers = FALSE,
#   remove_url = FALSE,
#   remove_separators = TRUE,
#   split_hyphens = FALSE,
#   split_tags = FALSE,
#   include_docvars = TRUE,
#   padding = FALSE,
#   verbose = quanteda_options("verbose"),
#   ...
# )
# Remove high-frequency function words (stopwords).
# The dfm is the dedicated format for processing corpus data.
# Apply tokens() first, then dfm():
#   dfm(tokens(corpus(text data)))
# Build a document-feature matrix (dfm): rows = documents, columns = word
# types, cells = counts. Tokenize first, removing punctuation.
# Use TRUE, not the reassignable shorthand T, for remove_punct.
tmp <- tokens(nicestJP.corpus, remove_punct = TRUE)
nicestJP.corpus.dfm <- dfm(tmp)
nicestJP.corpus.dfm
## Document-feature matrix of: 1,836 documents, 10,375 features (99.02% sparse) and 0 docvars.
## features
## docs some people say that specialized knowledge is not important
## JAN0001_P1B.txt 2 1 1 3 5 5 5 6 1
## JAN0001_P2B.txt 0 3 0 4 0 0 3 7 0
## JAN0001_P3B.txt 1 3 0 0 0 0 4 4 0
## JAN0001_P4B.txt 1 1 0 0 0 0 3 0 1
## JAN0001_P5B.txt 2 7 0 6 0 0 3 5 0
## JAN0001_P6B.txt 1 1 0 1 0 0 5 5 0
## features
## docs for
## JAN0001_P1B.txt 2
## JAN0001_P2B.txt 2
## JAN0001_P3B.txt 1
## JAN0001_P4B.txt 0
## JAN0001_P5B.txt 1
## JAN0001_P6B.txt 2
## [ reached max_ndoc ... 1,830 more documents, reached max_nfeat ... 10,365 more features ]
topfeatures(nicestJP.corpus.dfm, 20)
## to the in is and i people of you that a
## 13246 10230 7882 7453 7290 6723 6684 6226 5680 5284 5072
## are it have we they not can for do
## 5042 4882 4813 4793 4620 4317 4135 3719 3427
# quanteda.textplots provides plotting helpers such as textplot_wordcloud().
library(quanteda.textplots)
##
## 次のパッケージを付け加えます: 'quanteda.textplots'
## 以下のオブジェクトは 'package:igraph' からマスクされています:
##
## as.igraph
# Word cloud of raw frequencies (dominated by function words at this stage).
textplot_wordcloud(nicestJP.corpus.dfm)
# Rebuild the dfm with English stopwords removed before plotting.
# Use TRUE, not the reassignable shorthand T, for remove_punct.
tmp <- tokens(nicestJP.corpus, remove_punct = TRUE)
tmp1 <- tokens_remove(tmp, stopwords("en"))
tmp2 <- dfm(tmp1)
textplot_wordcloud(tmp2)
# As above, but keep only features occurring at least 300 times overall
# so the cloud shows just the high-frequency content words.
# Use TRUE, not the reassignable shorthand T, for remove_punct.
tmp <- tokens(nicestJP.corpus, remove_punct = TRUE)
tmp1 <- tokens_remove(tmp, stopwords("en"))
tmp2 <- dfm(tmp1)
tmp3 <- dfm_trim(tmp2, min_termfreq = 300)
textplot_wordcloud(tmp3)
# Statistically scored two-word collocations (default size = 2),
# ranked by the z statistic.
library(quanteda.textstats)
textstat_collocations(nicestJP.corpus) %>% head()
## collocation count count_nested length lambda z
## 1 it is 2421 0 2 4.411601 128.56187
## 2 i think 1647 0 2 4.626458 107.84185
## 3 there are 1311 0 2 5.000913 101.90447
## 4 new things 874 0 2 5.525707 96.98483
## 5 young people 1779 0 2 6.006212 93.94900
## 6 do not 1041 0 2 3.973480 93.42041
textstat_collocations(nicestJP.corpus, size = 3, min_count = 100) %>% head()
## collocation count count_nested length lambda z
## 1 is in a 100 0 3 4.270856 16.13186
## 2 to travel is 154 0 3 3.430332 14.85661
## 3 more and more 167 0 3 10.440850 14.06000
## 4 travel is in 100 0 3 6.123315 13.78210
## 5 they really are 275 0 3 6.050296 13.64728
## 6 cars in use 240 0 3 5.885493 13.03403
# Search multi-word expressions: phrase() marks each string as a fixed
# sequence of tokens rather than independent words.
multiword <- c("in addition", "on the other hand", "as a result")
rengo <- phrase(multiword)
# kwic() on a raw corpus is deprecated in quanteda 3; tokenize first.
kwic(tokens(nicestJP.corpus), rengo) %>% head()
## Warning: 'kwic.corpus()' is deprecated. Use 'tokens()' first.
## Keyword-in-context with 6 matches.
## [JAN0001_P2B.txt, 12:15] are active and free, | on the other hand |
## [JAN0001_P5B.txt, 117:118] is the biggest reason. | In addition |
## [JAN0001_P7B.txt, 196:199] answer ten questions. But | on the other hand |
## [JAN0002_P3A.txt, 88:91] to lead thier fammilies. | On the other hand |
## [JAN0003_P2B.txt, 95:97] at school or college. | As a result |
## [JAN0004_P5B.txt, 121:122] we don't use cars. | In addition |
##
## olders are less active and
## above reason, people will
## , if you understanding ideas
## , boy and girls are
## , they get a good
## , when we go somewhere
# Export the multi-word KWIC results to an Excel workbook.
library(openxlsx)
# kwic() on a raw corpus is deprecated in quanteda 3; tokenize first
# (this also avoids the deprecation warning).
rengo.dat <- kwic(tokens(nicestJP.corpus), rengo)
rengo.dat %>% write.xlsx("rengo.dat.xlsx")