# Show where R is currently running and what that directory contains.
getwd()
list.files()
# List the learner (NNS) corpus files by path instead of calling setwd():
# the working directory stays where it was, so the relative "NICER1_3_2/..."
# paths used further down keep resolving from the same place.
list.files("NICER1_3_2/NICER_NNS")
. ピリオド一つは何でも一文字
* アスタリスクは直前の文字の0回以上の繰り返し
*? とすると「最短一致」
+ プラス記号は直前の文字の1回以上の繰り返し
+? とすると「最短一致」
? 疑問符は直前の文字(項目)の0回もしくは1回の繰り返し。つまり、あってもなくてもよい。
( ) 丸かっこで囲むとその中の文字列が連続したカタマリとして扱われる。
文字クラスを表す [ ] との違いに注意。
| は、その前後の文字のどちらか。( | ) で囲むと、その前後の文字列のどちらか。
例: (am|is|are)
^ は、行頭
$ は、行末
\w で半角英数とアンダースコア
\W \w以外のもの
\b 「単語」の境界
例: \bthe\b
[ ] で囲まれた「クラス」はその中のどれか一文字
例: [Hh]owever
[ - ] で、-の前後の連続する文字列の範囲 例:[a-z] は小文字aからzまで
例: [a-zA-Z]
例: [0-9]
例: [a-zA-Z0-9]
[^ ] と最初に ^ を置くと、そこに並ぶいずれの文字でもないもの、という意味(つまり否定)
[:space:] で半角スペース・タブ記号・改行記号(実際に使うときは [[:space:]] のように、さらに [ ] で囲む。以下の [:punct:] なども同様)
[:punct:] で句読点類
[:digit:] で数字
[:lower:] 小文字
[:upper:] 大文字
[:alpha:] 大文字・小文字
[:alnum:] で半角英数
# Load ten learner essays (JPN501-JPN510) and ten native-speaker essays
# (NS501-NS510) from the NICER corpus.
#
# Files are addressed with file.path() relative to the directory that contains
# "NICER1_3_2" instead of via setwd(): two consecutive relative setwd() calls
# compound (the second would be looked up inside the first), and this way the
# working directory is left untouched.
nicer_root <- "NICER1_3_2"
essay_ids <- 501:510

# Read one essay file into a character vector, one element per line.
# sep = "\n" makes scan() split on newlines only, so spaces inside a line are
# kept; scan() skips blank lines by default.
read_essay <- function(group_dir, prefix, id) {
  scan(
    file.path(nicer_root, group_dir, paste0(prefix, id, ".txt")),
    what = character(),
    sep = "\n"
  )
}

jpn_essays <- lapply(
  essay_ids,
  function(id) read_essay("NICER_NNS", "JPN", id)
)
names(jpn_essays) <- paste0("jpn", essay_ids)

ns_essays <- lapply(
  essay_ids,
  function(id) read_essay("NICER_NS", "NS", id)
)
names(ns_essays) <- paste0("ns", essay_ids)

# Expose every essay under its own variable name (jpn501 ... jpn510,
# ns501 ... ns510), which the rest of the script refers to directly.
invisible(list2env(c(jpn_essays, ns_essays), envir = environment()))

# Pooled corpora: all lines of the ten essays, in file order.
ns10 <- unlist(ns_essays, use.names = FALSE)
jpn10 <- unlist(jpn_essays, use.names = FALSE)
# Regex searches over the pooled native-speaker lines; value = TRUE returns the
# matching lines themselves rather than their indices. TRUE is spelled out
# because T is an ordinary variable that can be reassigned.

# "however" with either a lower- or upper-case first letter.
grep("[hH]owever", ns10, value = TRUE)
# "there" followed by a form of "be", matched case-insensitively.
grep("there (is|are|was|were)", ns10, value = TRUE, ignore.case = TRUE)
# A form of "be", one following word, then "to" and a space.
grep("(is|was|are|were) \\w+? to ", ns10, value = TRUE, ignore.case = TRUE)
*to の後ろに半角スペースがある点に注意
# "work" optionally followed by "ed", "ing" or "s", case-insensitively.
# The pattern has no \\b anchors, so it also matches inside longer words
# (for example "homework"). TRUE is spelled out because T can be reassigned.
grep("work(ed|ing|s)?", ns10, value = TRUE, ignore.case = TRUE)
# Peek at the start of one essay file. With no count, head() returns the first
# six elements; the recorded output shows that the file opens with "@" header
# lines.
head(ns501)
## [1] "@Begin" "@Participants:\tNS501" "@PID:\tPIDNS501"
## [4] "@Age:\t27" "@Sex:\tM" "@L1:\tAmE"
# A second argument sets how many leading elements are returned (here 10).
head(ns501, 10)
## [1] "@Begin"
## [2] "@Participants:\tNS501"
## [3] "@PID:\tPIDNS501"
## [4] "@Age:\t27"
## [5] "@Sex:\tM"
## [6] "@L1:\tAmE"
## [7] "@FatherL1:\tnone"
## [8] "@MotherL1:\tnone"
## [9] "@AcademicBackground:\tM1"
## [10] "@OtherLanguage:\tJapanese=0.7=good;none=="
# Peek at the end of the same file. With no count, tail() returns the last six
# elements; the file closes with "@End".
tail(ns501)
## [1] "%COM:\t"
## [2] "*NS501:\tWith life spans ever increasing, we may live long enough to see our failures or our triumphs come to fruition."
## [3] "%COM:\t"
## [4] "*NS501:\tThank you for your time."
## [5] "%COM:\t"
## [6] "@End"
# A positive count returns that many trailing elements (here the last 10).
tail(ns501, 10)
## [1] "*NS501:\tWhat is a song without anyone to hear, or a painting for no one to see, the same most definitely applies here."
## [2] "%COM:\t"
## [3] "%par:"
## [4] "*NS501:\tThese are just a few thoughts about teaching styles and the importance of education on the future of humankind."
## [5] "%COM:\t"
## [6] "*NS501:\tWith life spans ever increasing, we may live long enough to see our failures or our triumphs come to fruition."
## [7] "%COM:\t"
## [8] "*NS501:\tThank you for your time."
## [9] "%COM:\t"
## [10] "@End"
# A negative count drops that many elements from the front and returns the
# rest: everything after the first 90 elements (15 elements in this output).
tail(ns501, -90)
## [1] "%COM:\t"
## [2] "*NS501:\tWe in the scientific community must also blame ourselves."
## [3] "%COM:\t"
## [4] "*NS501:\tIt's just as important to share the knowledge that we find and make it approachable for even the most uninitiated laymen, because if we find this knowledge and take it with us to the grave then it all of our work would have been for nothing."
## [5] "%COM:\t"
## [6] "*NS501:\tWhat is a song without anyone to hear, or a painting for no one to see, the same most definitely applies here."
## [7] "%COM:\t"
## [8] "%par:"
## [9] "*NS501:\tThese are just a few thoughts about teaching styles and the importance of education on the future of humankind."
## [10] "%COM:\t"
## [11] "*NS501:\tWith life spans ever increasing, we may live long enough to see our failures or our triumphs come to fruition."
## [12] "%COM:\t"
## [13] "*NS501:\tThank you for your time."
## [14] "%COM:\t"
## [15] "@End"
# Combine head() and tail() to look at a window in the middle of the file:
# keep the first 27 elements, then show the last 3 of those (elements 25-27).
# NOTE(review): the variable name `body` is also the name of a base R function;
# calls to body() still work, but a different name would be less confusing.
body <- head(ns501, 27)
tail(body, 3)
## [1] "%COM:\t"
## [2] "*NS501:\tSince the time of hunters and gatherers, information has been passed down from one generation to the next, not only to preserve said knowledge, but in hopes to expand and build upon it."
## [3] "%COM:\t"
# The same window in one nested call, without the intermediate variable.
tail(head(ns501, 27), 3)
## [1] "%COM:\t"
## [2] "*NS501:\tSince the time of hunters and gatherers, information has been passed down from one generation to the next, not only to preserve said knowledge, but in hopes to expand and build upon it."
## [3] "%COM:\t"
# Pick elements by position with [ ]: a single index returns one element.
ns501[6]
## [1] "@L1:\tAmE"
ns501[10]
## [1] "@OtherLanguage:\tJapanese=0.7=good;none=="
# A range m:n returns consecutive elements (here 77 through 80).
ns501[77:80]
## [1] "*NS501:\tThere was a time that those exploring the reaches of Science and Technology were the rock stars or idols of their time."
## [2] "%COM:\t"
## [3] "*NS501:\tNow, if only it were so."
## [4] "%COM:\t"
# c() picks arbitrary positions; taking every other index here skips the
# "%COM:" lines that sit between the utterance lines in the output above.
ns501[c(77, 79, 81)]
## [1] "*NS501:\tThere was a time that those exploring the reaches of Science and Technology were the rock stars or idols of their time."
## [2] "*NS501:\tNow, if only it were so."
## [3] "*NS501:\tI remember fondly as a child anxiously waiting the next episode of\"Bill Nye the Science Guy\" to take me into a world of molecules and atoms and perhaps grant me a new set of missives so that I may \"try it at home\"."
・単語リストを作るには、本文情報の部分のみを使う。
・データのフォーマットがどうなっているかを踏まえて、「処理」を行う。
・各データは、一文一行で、各行頭に「発話者コード」が入っている。
学習者データは、例えば、JPN501: となっている。
母語話者データは例えば、NS501: となっている。
・必要な行だけ取り出すにはどうしたらよいか。
=>母語話者データなら、NS501という文字列が入っている行を取り出せばよいはず。
# Return every line of the essay that contains the speaker code "NS501".
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
grep("NS501", ns501, value = TRUE)
●結果を見てみると一つ目と二つ目の要素は該当しない(ヘッダーの説明)ものなので不要
[1] "@Participants:\tNS501"
[2] "@PID:\tPIDNS501"
=> 上の二行を削除
tailの応用で削除
●母語話者データのデータ部分のみ抽出(抽出した結果を新しい変数 ns501.data に入れる)
# Keep only the utterance lines of the essay: take every line containing
# "NS501", then drop the first two matches with tail(x, -2).
# NOTE(review): this relies on exactly two non-utterance lines matching
# "NS501" before the text starts; confirm that for any other file.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
ns501.data <- tail(grep("NS501", ns501, value = TRUE), -2)
head(ns501.data)
## [1] "*NS501:\tLearning Styles and Why Education is Important!"
## [2] "*NS501:\tThe ultimate gift one can bestow on another is the gift of knowledge."
## [3] "*NS501:\tSince the time of hunters and gatherers, information has been passed down from one generation to the next, not only to preserve said knowledge, but in hopes to expand and build upon it."
## [4] "*NS501:\tThis is why the utmost care must be placed on how this information flows from one generation to the next."
## [5] "*NS501:\tDepending on what country that you come from, the learning styles tend to change completely."
## [6] "*NS501:\tInitially when I entered college, I too wished to teach those who would inherit this planet."
# Break every utterance line apart at each single space; the result is a list
# holding one character vector of pieces per input line.
strsplit(x = ns501.data, split = " ")
* ns501.data の各行を半角スペースで区切り、行ごとに単語単位へバラバラにする
* バラバラにした結果は「リスト」に入る(2次元データ)(各行の中で単語単位になっている)
[[1]]
[1] [2] ...
[[2]]
[1] [2] ...
[[3]]
[1] [2] ...
# Step 1: split each line on spaces and keep the per-line pieces as a list.
ns501.data.list <- strsplit(x = ns501.data, split = " ")

# Step 2: flatten the list into one long character vector of tokens,
# store it, and display it.
tmp <- unlist(ns501.data.list)
tmp

# Step 3: display the tokens in sorted order, first from the stored vector and
# then as a single nested expression (both print the same result).
sort(tmp)
sort(unlist(ns501.data.list))

# Step 4: keep the sorted token vector for later use.
ns501.token <- sort(tmp)
(★UNIXのコマンド uniq とつづりが違うので注意)
# Collapse the token vector to its distinct values (the word "types"):
# store them first, then display the stored result.
ns501.type <- unique(x = ns501.token)
ns501.type
――――――――――――――――――――――――――――――――――――――――
これで一応「単語リスト」はできるが、できたものをよく見てみよう。
これでよいかな?
――――――――――――――――――――――――――――――――――――――――
●削除するとは、、、何もなしで置き換える
# Show the utterance lines with the leading "*NS501:" code and the tab after it
# replaced by nothing. The result is only displayed, not stored.
gsub(pattern = "\\*NS501:\\t", replacement = "", x = ns501.data)
●大文字はすべて小文字に統一
# Fold the utterance lines to lower case and keep the result.
ns501.data2 <- tolower(x = ns501.data)
# Show the lower-cased lines with every punctuation character turned into a
# space. The result is only displayed, not stored.
gsub(pattern = "[[:punct:]]", replacement = " ", x = ns501.data2)