{{category R.package}} !!!spacyr {{outline}} ---- https://cran.r-project.org/web/packages/spacyr/index.html https://rdrr.io/cran/spacyr/f/vignettes/using_spacyr.Rmd PythonのspaCyパッケージのラッパー 1. テキストを構文解析し、トークンと文に 2. トークンのレマ化 3. 文法依存関係 4. 連語検索 先にAnacondaなどのPython環境をインストールしておく必要がある。spaCy本体は下記のspacy_install()がconda環境に自動的にインストールしてくれる。 !!インストール {{pre install.packages("spacyr") library(spacyr) spacy_install() }} !!使用の実際 !サンプルデータ:JPN501.txt from NICER *本文部分だけにしたテキストファイル: JPN501.txt.data {{pre jpn501 <- scan("JPN501.txt.data", what="char", sep="\n") head(jpn501) [1] "What kind of sports do you like?" [2] "Do you like soccer, base ball or swimming?" [3] "There are many and variety sports around the world." [4] "A country has some traditional sports." [5] "Of course, there are some traditional sports in Japan." [6] "They are called \"BUDO\"." }} !使い始め *英語を指定 {{pre spacy_initialize(model = "en_core_web_sm") }} !テキスト解析(オプションなしだと、POS付与):spacy_parse(txt) {{pre jpn501.parsed <- spacy_parse(jpn501) jpn501.parsed doc_id sentence_id token_id token lemma pos entity text1 1 1 What what DET text1 1 2 kind kind NOUN text1 1 3 of of ADP text1 1 4 sports sport NOUN text1 1 5 do do AUX text1 1 6 you you PRON text1 1 7 like like VERB text1 1 8 ? ? PUNCT text2 1 1 Do do AUX text2 1 2 you you PRON }} !トークン化(単語ごとにバラバラにする):spacy_tokenize(txt) {{pre jpn501.token <- spacy_tokenize(jpn501) head(jpn501.token) $text1 [1] "What" "kind" "of" "sports" "do" "you" "like" "?" $text2 [1] "Do" "you" "like" "soccer" "," "base" "ball" "or" "swimming" [10] "?" $text3 [1] "There" "are" "many" "and" "variety" "sports" "around" "the" "world" "." $text4 [1] "A" "country" "has" "some" "traditional" "sports" "." $text5 [1] "Of" "course" "," "there" "are" "some" "traditional" [8] "sports" "in" "Japan" "." $text6 [1] "They" "are" "called" "\"" "BUDO" "\"" "." 
}} *出力をデータフレームにできる {{pre library(dplyr) spacy_tokenize(jpn501, remove_punct = TRUE, output = "data.frame") %>% tail() }} !「エンティティ」(固有名詞類)の抽出:entity_extract(parsedtxt) {{pre entity_extract(jpn501.parsed) doc_id sentence_id entity entity_type text5 1 Japan GPE text7 1 JYUDO GPE text7 1 KENDO ORG text9 1 REI ORG text16 1 BUDOJYO ORG text17 1 BUDOJYO ORG text18 1 REI ORG text28 1 REI ORG text30 1 Japanese NORP }} !名詞句抽出:nounphrase_extract(parsedtxt) *テキスト解析する際に、名詞句オプションを付けておく必要がある。 spacy_parse(txt, nounphrase = TRUE) {{pre jpn501.NP <- spacy_parse(jpn501, lemma = FALSE, entity = TRUE, nounphrase = TRUE) entity_extract(jpn501.NP) nounphrase_extract(jpn501.NP) doc_id sentence_id nounphrase text1 1 What_kind text1 1 sports text1 1 you text10 1 ,___"REI text10 1 the_feeling text10 1 you text10 1 people text10 1 who text10 1 the_game text11 1 example }} !名詞句をひと塊としてPOSタグ付与:nounphrase_consolidate(parsedtxt) {{pre nounphrase_consolidate(jpn501.NP) doc_id sentence_id token_id token pos text1 1 1 What_kind nounphrase text1 1 2 of ADP text1 1 3 sports nounphrase text1 1 4 do AUX text1 1 5 you nounphrase text1 1 6 like VERB text1 1 7 ? 
PUNCT text10 1 1 First ADV text10 1 2 ,_ _"REI nounphrase text10 1 3 " PUNCT }} !エンティティの一覧作成:spacy_extract_entity(txt) *文中の何単語目か(start_id)も表示 {{pre spacy_extract_entity(jpn501) doc_id text ent_type start_id length text5 Japan GPE 10 1 text7 JYUDO GPE 3 1 text7 KENDO ORG 5 1 text9 REI ORG 4 1 text10 First ORDINAL 1 1 text15 Secondly ORDINAL 1 1 text16 BUDOJYO ORG 9 1 text17 BUDOJYO ORG 11 1 text18 REI ORG 4 1 text28 REI ORG 21 1 }} !名詞句のみの一覧作成:spacy_extract_nounphrases(txt) {{pre spacy_extract_nounphrases(jpn501) doc_id text root_text start_id root_id length text1 What kind kind 1 2 2 text1 sports sports 4 4 1 text1 you you 6 6 1 text2 you you 2 2 1 text2 soccer soccer 4 4 1 text2 base ball ball 6 7 2 text2 swimming swimming 9 9 1 text3 many and variety sports sports 3 6 4 text3 the world world 8 9 2 text4 A country country 1 2 2 }} !!文法依存関係 !spacy_parse(テキスト, dependency = TRUE, output = "data.frame") *解析する際に、依存関係のオプションを付ける:dependency = TRUE *データフレームにしておく:output = "data.frame" {{pre jpn501.df <- spacy_parse(jpn501, dependency = TRUE, lemma = FALSE, pos = TRUE, output = "data.frame") jpn501.df doc_id sentence_id token_id token pos head_token_id dep_rel entity text1 1 1 What DET 2 det text1 1 2 kind NOUN 7 dobj text1 1 3 of ADP 2 prep text1 1 4 sports NOUN 3 pobj text1 1 5 do AUX 7 aux text1 1 6 you PRON 7 nsubj text1 1 7 like VERB 7 ROOT text1 1 8 ? 
PUNCT 7 punct text2 1 1 Do AUX 3 aux text2 1 2 you PRON 3 nsubj }} !図にする:textplot *textplotパッケージを使用 **UDPipeの出力形式を前提にしているので、spacyrの出力には下記の修正が必要 *データフレームの見出しposをuposに名称変更 *一文ずつなので、フィルターで特定の文を選ぶ {{pre library(textplot) library(dplyr) dplyr::rename(jpn501.df, upos = pos) %>% dplyr::filter(doc_id == "text1") %>% textplot_dependencyparser() }} {{ref_image jpn501.dep.png}} !!パターンマッチング matcher ※注意:以下に挙げたspacy_initialize_matcher()などのmatcher関連関数は、CRAN版spacyrのAPIには見当たらない模様。利用前に関数の有無を要確認。 +初期化 matcher <- spacy_initialize_matcher() +パターンの定義 pattern <- list(パターンを定義) +テキスト解析を施しておく doc <- spacy_parse(テキスト) +パターンをmatcherに追加 spacy_add_matcher(matcher, "文字列", pattern) +matcherの実行 result <- spacy_matcher(doc, matcher) +結果の出力 print(result) !!終わる前にやっておく:spacy_finalize()