連語表現の一覧

I went to school yesterday.

I went
  went to
       to school
          school yesterday

単語が5つあると、
2単語の連続は 5 − 1 = 4 個できる。

ファイル選択ダイアログ（窓）を開いてファイルを選び、2語表現(2-gram)の一覧を出力する

# my2gram: list every 2-word sequence (bigram) found in one corpus file.
#
# Arguments:
#   file  Path of the file to read.  When omitted, a file-selection dialog
#         is opened.  file.choose() is used because it is available on
#         every platform, whereas choose.files() exists only on Windows.
#
# Returns:
#   A character vector of bigrams ("word1 word2") in order of occurrence,
#   or character(0) when the text contains fewer than two words.
#
# Only lines containing a tag of the form "*JPN" or "*NS" + three characters
# + ":" + TAB are used; the tag itself is removed before tokenising.
# Bigrams run across line boundaries because all lines are pooled into one
# token sequence.
my2gram <- function(file = file.choose()) {
  tag <- "\\*(JPN|NS)...:\t"

  lines.tmp <- scan(file, what = "char", sep = "\n")
  data.tmp <- grep(tag, lines.tmp, value = TRUE)       # keep tagged lines only
  body.tmp <- gsub(tag, "", data.tmp)                  # strip the tag
  body.tmp <- body.tmp[body.tmp != ""]                 # drop empty lines

  # Lower-case, then split on runs of non-word characters
  # (so an apostrophe also splits: "don't" -> "don", "t").
  tokens <- unlist(strsplit(tolower(body.tmp), "\\W+"))
  tokens <- tokens[tokens != ""]                       # drop empty tokens

  n <- length(tokens)
  if (n < 2L) {
    # 1:(n - 1) would count backwards here and produce bogus "x NA" items.
    return(character(0))
  }

  # Pair word i with word i + 1 for i = 1 .. n - 1, all at once.
  paste(tokens[-n], tokens[-1L])
}

# Run the function: a file-selection dialog opens and the bigrams of the
# chosen file are printed to the console.
my2gram()

連語表現を並べ替えて頻度一覧表を作る

# Step-by-step construction of a bigram frequency table.
# (Comments are separated from the code by ASCII spaces only, and TRUE is
# spelled out because T is an ordinary variable that can be reassigned.)

bigram.tmp <- my2gram()                                  # keep the bigram output

bigram.tmp                                               # look at the contents

bigram.tmp.sorted <- sort(bigram.tmp)                    # sort alphabetically

head(bigram.tmp.sorted)                                  # first few sorted items

table(bigram.tmp.sorted)                                 # turn into a frequency table

sort(table(bigram.tmp.sorted), decreasing = TRUE)        # most frequent first

head(sort(table(bigram.tmp.sorted), decreasing = TRUE))  # top of the ranked table

フォルダー内のすべてのファイルを対象に、2語表現(2-gram)の一覧を出力する

# my2gramFiles: list the bigrams of every file in a folder.
#
# Arguments:
#   path  Folder whose files are read.  Defaults to the current working
#         directory, which is what the function always used before.
#
# Returns:
#   A character vector holding the bigrams of all files, file after file
#   (files in the order given by list.files()); character(0) if none.
#   Bigrams never span two files because each file is tokenised separately.
#
# Sub-directories are skipped (scan() cannot read them), and a file with
# fewer than two words contributes nothing instead of bogus "NA NA" items.
my2gramFiles <- function(path = ".") {
  tag <- "\\*(JPN|NS)...:\t"

  files <- list.files(path, full.names = TRUE)
  files <- files[!dir.exists(files)]                     # regular files only

  per.file <- lapply(files, function(j) {
    lines.tmp <- scan(j, what = "char", sep = "\n")      # one file at a time
    data.tmp <- grep(tag, lines.tmp, value = TRUE)       # keep tagged lines only
    body.tmp <- gsub(tag, "", data.tmp)                  # strip the tag
    body.tmp <- body.tmp[body.tmp != ""]                 # drop empty lines

    # Lower-case, then split on runs of non-word characters.
    tokens <- unlist(strsplit(tolower(body.tmp), "\\W+"))
    tokens <- tokens[tokens != ""]                       # drop empty tokens

    n <- length(tokens)
    if (n < 2L) {
      return(character(0))                               # no bigram possible
    }
    paste(tokens[-n], tokens[-1L])                       # word i + word i + 1
  })

  # Concatenate the per-file results in one step (no repeated copying).
  as.character(unlist(per.file, use.names = FALSE))
}

実行：母語話者データの場合

# Move into the data folder, remembering where we were: setwd() returns the
# previous working directory.
old.wd <- setwd("NICER1_3_2/NICER_NS")      # choose one of the two
#old.wd <- setwd("NICER1_3_2/NICER_NNS")

bigramFiles <- my2gramFiles()

# Go back, so that a later relative setwd("NICER1_3_2/...") still works
# when the whole script is run in one session.
setwd(old.wd)

#head(bigramFiles)
head(sort(table(sort(bigramFiles)), decreasing = TRUE))

## 
##    of the    in the     it is    to the     to be the world 
##       388       370       188       161       155       130

学習者データの場合

# Move into the data folder, remembering where we were: setwd() returns the
# previous working directory.
#old.wd <- setwd("NICER1_3_2/NICER_NS")      # choose one of the two
old.wd <- setwd("NICER1_3_2/NICER_NNS")

bigramFiles <- my2gramFiles()

# Go back, so that a later relative setwd("NICER1_3_2/...") still works
# when the whole script is run in one session.
setwd(old.wd)

#head(bigramFiles)
head(sort(table(sort(bigramFiles)), decreasing = TRUE))

## 
##       it is     i think      in the       don t high school       a lot 
##         533         467         353         329         288         280

ファイルを選んで、2連語で、上位10位まで

# my2gramTop10: the ten most frequent bigrams of one corpus file.
#
# Arguments:
#   file  Path of the file to read.  When omitted, a file-selection dialog
#         is opened (file.choose() is available on every platform;
#         choose.files() exists only on Windows).
#
# Returns:
#   A one-dimensional frequency table (at most 10 entries) ordered from the
#   most to the least frequent bigram.
#
# Only lines containing a tag of the form "*JPN" or "*NS" + three characters
# + ":" + TAB are used; the tag itself is removed before tokenising.
my2gramTop10 <- function(file = file.choose()) {
  tag <- "\\*(JPN|NS)...:\t"

  lines.tmp <- scan(file, what = "char", sep = "\n")
  data.tmp <- grep(tag, lines.tmp, value = TRUE)       # keep tagged lines only
  body.tmp <- gsub(tag, "", data.tmp)                  # strip the tag
  body.tmp <- body.tmp[body.tmp != ""]                 # drop empty lines

  # Lower-case, then split on runs of non-word characters.
  tokens <- unlist(strsplit(tolower(body.tmp), "\\W+"))
  tokens <- tokens[tokens != ""]                       # drop empty tokens

  # Pair word i with word i + 1; fewer than two words means no bigram
  # (1:(n - 1) would count backwards and produce bogus "x NA" items).
  n <- length(tokens)
  bigram.tmp <- if (n < 2L) character(0) else paste(tokens[-n], tokens[-1L])

  # table() already orders its categories alphabetically, so no separate
  # sort() of the bigrams is needed.  dnn keeps the header line of the
  # printed table the same as before.
  freq <- table(bigram.tmp, dnn = "bigram.tmp.sorted")

  head(sort(freq, decreasing = TRUE), 10)              # ten most frequent
}

実行

# Run it: choose a file in the dialog; the ten most frequent bigrams of that
# file are printed.
my2gramTop10()

ファイルを選んで、2連語で、上位何位まで表示するか指定できるようにする

# my2gramTopN: the `a` most frequent bigrams of one corpus file.
#
# Arguments:
#   a     How many ranks to show (passed to head() as its n argument).
#   file  Path of the file to read.  When omitted, a file-selection dialog
#         is opened (file.choose() is available on every platform;
#         choose.files() exists only on Windows).
#
# Returns:
#   A one-dimensional frequency table with at most `a` entries, ordered from
#   the most to the least frequent bigram.
#
# Only lines containing a tag of the form "*JPN" or "*NS" + three characters
# + ":" + TAB are used; the tag itself is removed before tokenising.
my2gramTopN <- function(a, file = file.choose()) {
  tag <- "\\*(JPN|NS)...:\t"

  lines.tmp <- scan(file, what = "char", sep = "\n")
  data.tmp <- grep(tag, lines.tmp, value = TRUE)       # keep tagged lines only
  body.tmp <- gsub(tag, "", data.tmp)                  # strip the tag
  body.tmp <- body.tmp[body.tmp != ""]                 # drop empty lines

  # Lower-case, then split on runs of non-word characters.
  tokens <- unlist(strsplit(tolower(body.tmp), "\\W+"))
  tokens <- tokens[tokens != ""]                       # drop empty tokens

  # Pair word i with word i + 1; fewer than two words means no bigram
  # (1:(n - 1) would count backwards and produce bogus "x NA" items).
  n <- length(tokens)
  bigram.tmp <- if (n < 2L) character(0) else paste(tokens[-n], tokens[-1L])

  # table() already orders its categories alphabetically, so no separate
  # sort() of the bigrams is needed.  dnn keeps the header line of the
  # printed table the same as before.
  freq <- table(bigram.tmp, dnn = "bigram.tmp.sorted")

  head(sort(freq, decreasing = TRUE), a)               # top `a` ranks
}

実行

# Run it with a = 20: the 20 most frequent bigrams of the chosen file are
# printed.
my2gramTopN(20)

ディレクトリー内のすべてのファイルを対象に、3連語で、指定した順位まで

# my3gramFilesN: the `a` most frequent 3-word sequences (trigrams) over all
# files of a folder.
#
# Arguments:
#   a     How many ranks to show (passed to head() as its n argument).
#   path  Folder whose files are read.  Defaults to the current working
#         directory, which is what the function always used before.
#
# Returns:
#   A one-dimensional frequency table with at most `a` entries, ordered from
#   the most to the least frequent trigram.
#
# Trigrams never span two files because each file is tokenised separately.
# Sub-directories are skipped, and a file with fewer than three words
# contributes nothing instead of bogus "NA"-padded items.
my3gramFilesN <- function(a, path = ".") {
  tag <- "\\*(JPN|NS)...:\t"

  files <- list.files(path, full.names = TRUE)
  files <- files[!dir.exists(files)]                     # regular files only

  per.file <- lapply(files, function(j) {
    lines.tmp <- scan(j, what = "char", sep = "\n")      # one file at a time
    data.tmp <- grep(tag, lines.tmp, value = TRUE)       # keep tagged lines only
    body.tmp <- gsub(tag, "", data.tmp)                  # strip the tag
    body.tmp <- body.tmp[body.tmp != ""]                 # drop empty lines

    # Lower-case, then split on runs of non-word characters.
    tokens <- unlist(strsplit(tolower(body.tmp), "\\W+"))
    tokens <- tokens[tokens != ""]                       # drop empty tokens

    n <- length(tokens)
    if (n < 3L) {
      return(character(0))                               # no trigram possible
    }
    i <- seq_len(n - 2L)                                 # number of words minus 2
    paste(tokens[i], tokens[i + 1L], tokens[i + 2L])     # words i, i + 1, i + 2
  })

  ngramFilesV <- as.character(unlist(per.file, use.names = FALSE))

  # table() already orders its categories alphabetically, so no separate
  # sort() is needed.  dnn keeps the header line of the printed table the
  # same as before.
  freq <- table(ngramFilesV, dnn = "ngramFilesV.sorted")

  head(sort(freq, decreasing = TRUE), a)                 # top `a` ranks
}

実行

# Move into the data folder, remembering where we were: setwd() returns the
# previous working directory.
#old.wd <- setwd("NICER1_3_2/NICER_NS")      # choose one of the two
old.wd <- setwd("NICER1_3_2/NICER_NNS")

trigram.top <- my3gramFilesN(10)

# Go back, so that later relative paths are still resolved from the
# original folder.
setwd(old.wd)

trigram.top                                  # show the 10 most frequent trigrams

## ngramFilesV.sorted
##           a lot of       i think that junior high school         when i was 
##                262                149                121                109 
##          i want to          and so on       in the world            i don t 
##                107                 99                 94                 84 
##         one of the            i was a 
##                 76                 71

出現頻度の検定：カイ自乗検定

例：　副詞の生起位置

a. However, such men don't make good husbands.
b. Such men, however, don't make good husbands.
c. Such men don't, however, make good husbands.
d. Such men don't make good husbands, however.
　　　　　　　　　　　　　(Halliday, 1985: 82)

文頭・文中・文末のどこにでも置くことができる。
実際は、どうなのか、コーパスで観察する。

LOB コーパス中のhowever の生起位置を調べたところ、次のような結果になった。

文頭 109
文中 347
文末 8

はて、本当にどこに置いてもよいのか、それとも、「偏り」があるのか。
頻度の差が、偶然による誤差のうちなのか、誤差とは言えないのか、カイ自乗検定で調べる。

頻度に差があるか: chisq.test()

# Observed counts of "however" in the LOB corpus by position in the sentence.
however.data <- c(
  109,  # sentence-initial
  347,  # sentence-medial
  8     # sentence-final
)

# Chi-squared test for given probabilities: are the three counts compatible
# with an even spread over the positions?
chisq.test(x = however.data)

## 
##  Chi-squared test for given probabilities
## 
## data:  however.data
## X-squared = 391.74, df = 2, p-value < 2.2e-16

2.2e-16 の意味は、$2.2 \times 10^{-16}$ （つまりほとんどゼロ）
このバラつきは、偶然の誤差とは言えない。

文末は、少ないのでほとんど使わないとして、文頭と文中の使い分けに何か原則があるのだろうか？

Sugiura, M. (1991：平成 3年 3月 1日). The Distribution Environment of the Connective “However” and the Principle of Its Position —Based on the LOB Corpus—. 『中部大学女子短期大学紀要・言語文化研究』, 2, 47-63. https://ci.nii.ac.jp/lognavi?name=nels&lang=ja&type=pdf&id=ART0000872455&naid=110000483331

二次元の頻度差を考える。

イギリス英語とアメリカ英語で、 therefore の生起位置に違いがあるか。

位置　文頭　文中
  米　　38　53
  英　　15　96

行列データを作る：matrix()

c( , , , )の中のデータは、まず列(column)方向に（上から下へ）並べていく
nrow は行数、ncol は列数

# 2 x 2 table of "therefore": rows = American / British English,
# columns = sentence-initial / sentence-medial.
# matrix() fills the cells column by column.
therefore.data <- matrix(
  c(
    38, 15,  # column 1: sentence-initial (American, British)
    53, 96   # column 2: sentence-medial  (American, British)
  ),
  nrow = 2,
  ncol = 2
)
therefore.data

##      [,1] [,2]
## [1,]   38   53
## [2,]   15   96

# Chi-squared test of independence on the 2 x 2 table; as the printed title
# shows, Yates' continuity correction is applied.
chisq.test(therefore.data)

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  therefore.data
## X-squared = 19.179, df = 1, p-value = 1.19e-05

イギリス英語とアメリカ英語で、therefore の生起位置に有意な偏りがある

どこに差があるか：残差分析

群馬大学の青木先生のサイト

　　　http://aoki2.si.gunma-u.ac.jp/R/ 　　　 * 度数に関する検定 * カイ二乗分布を使用する独立性の検定と残差分析

# Load my.chisq.test() from Prof. Aoki's site; the script is read as EUC-JP.
# NOTE(review): this downloads and executes code over unencrypted HTTP on
# every run -- consider keeping a reviewed local copy and sourcing that.
source("http://aoki2.si.gunma-u.ac.jp/R/src/my-chisq-test.R", encoding="euc-jp")

my-chisq-test.Rというファイル名だが、関数名はmy.chisq.test()

# Independence test with residual analysis, using the function loaded by the
# source() call above.
# NOTE(review): the statistic differs from chisq.test() above, presumably
# because no continuity correction is applied -- confirm in the script.
my.chisq.test(therefore.data)

## 
##  カイ二乗分布を用いる独立性の検定（残差分析）
## 
## data:  therefore.data
## X-squared = 20.612, df = 1, p-value = 5.623e-06

# Keep the test result so that its details can be inspected.
result.ct <- my.chisq.test(therefore.data)

# summary() prints the adjusted residual and its p-value for each cell.
summary(result.ct)

## 調整された残差
##         [,1]    [,2]
## [1,]  4.5401 -4.5401
## [2,] -4.5401  4.5401
## 
## P 値
##            [,1]       [,2]
## [1,] 5.6231e-06 5.6231e-06
## [2,] 5.6231e-06 5.6231e-06

（結果の解釈例）
カイ二乗検定の結果より、イギリス英語とアメリカ英語で、therefore の生起位置に有意な偏りがある。残差分析の結果より、イギリス英語では、アメリカ英語に比べて、therefore を文中で用いる頻度が有意に高く、文頭で用いる頻度が有意に低いことが分かる。

対数尤度比検定：G^2

カイ二乗検定よりも、統計理論的には対数尤度比検定を利用する方が望ましい。
（カイ二乗検定は対数尤度の近似、対数尤度比検定は対数尤度を直接扱う）

# Load G2() from Prof. Aoki's site; the script is read as EUC-JP.
# NOTE(review): this downloads and executes code over unencrypted HTTP on
# every run -- consider keeping a reviewed local copy and sourcing that.
source("http://aoki2.si.gunma-u.ac.jp/R/src/G2.R", encoding="euc-jp")

# Log-likelihood-ratio (G-squared) test of independence on the 2 x 2 table.
G2(therefore.data)

## 
##  対数尤度比に基づく独立性の検定（G-squared test）
## 
## data:  therefore.data
## G-squared = 20.925, df = 1, p-value = 4.776e-06

パッケージ Deducer

# Install Deducer only when it is missing, so that re-running the script
# does not download and reinstall the package every time.
if (!requireNamespace("Deducer", quietly = TRUE)) {
  install.packages("Deducer")
}

library(Deducer)

##  要求されたパッケージ ggplot2 をロード中です

##  要求されたパッケージ JGR をロード中です

##  要求されたパッケージ rJava をロード中です

##  要求されたパッケージ JavaGD をロード中です

## 
## Please type JGR() to launch console. Platform specific launchers (.exe and .app) can also be obtained at http://www.rforge.net/JGR/files/.

##  要求されたパッケージ car をロード中です

##  要求されたパッケージ carData をロード中です

##  要求されたパッケージ MASS をロード中です

## 
## 
## Note Non-JGR console detected:
##  Deducer is best used from within JGR (http://jgr.markushelbig.org/).
##  To Bring up GUI dialogs, type deducer().

# G-test (log-likelihood-ratio test) of independence from the Deducer
# package, run without correction as the printed title states.
likelihood.test(therefore.data)

## 
##  Log likelihood ratio (G-test) test of independence without correction
## 
## data:  therefore.data
## Log likelihood ratio statistic (G) = 20.925, X-squared df = 1, p-value
## = 4.776e-06

サンプルサイズに関係ないオッズ比

オッズ比：Fisherの直接確率検定（正確確率検定）： fisher.test()
解釈の仕方：「1」が基準（＝両群のオッズに差はない）。1より大きければ、分子側の群のオッズの方が高い。

# Show the 2 x 2 table again before testing it.
therefore.data

##      [,1] [,2]
## [1,]   38   53
## [2,]   15   96

# Fisher's exact test; for this 2 x 2 table the output also reports the
# estimated odds ratio and its 95 percent confidence interval.
fisher.test(therefore.data)

## 
##  Fisher's Exact Test for Count Data
## 
## data:  therefore.data
## p-value = 9.958e-06
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  2.209169 9.784982
## sample estimates:
## odds ratio 
##   4.551871

アメリカ英語で therefore を文頭に置くオッズは、イギリス英語の約4.6倍である（これはオッズの比であり、確率そのものの比ではない点に注意）。

効果量

2x2の場合の効果量： φ (ファイ) 係数

# Install psych only when it is missing, so that re-running the script
# does not download and reinstall the package every time.
if (!requireNamespace("psych", quietly = TRUE)) {
  install.packages("psych")
}

library(psych)

## 
##  次のパッケージを付け加えます: 'psych'

##  以下のオブジェクトは 'package:car' からマスクされています: 
## 
##      logit

##  以下のオブジェクトは 'package:ggplot2' からマスクされています: 
## 
##      %+%, alpha

# Phi coefficient of the 2 x 2 table, computed with phi() from the psych
# package loaded above.
phi(therefore.data)

## [1] 0.32

0.1 < 効果量小 < 0.3 < 効果量中 < 0.5 < 効果量大

2x2以上の場合

クラメールのV（Cramer’s V）
解釈の仕方：０から１の間

# Install lsr (with its dependencies) only when it is missing, and fetch it
# over HTTPS rather than plain HTTP.
if (!requireNamespace("lsr", quietly = TRUE)) {
  install.packages("lsr", dependencies = TRUE, repos = "https://cran.rstudio.com/")
}

library(lsr)

# Cramer's V for the contingency table.
cramersV(therefore.data)

## [1] 0.3081308

0.1 < 効果量小 < 0.3 < 効果量中 < 0.5 < 効果量大

R for Learner Corpus Research 2021: 08

sugiura

2021/12/24

連語表現の一覧

ファイルを窓を開いて選択し、2語表現(2-gram)の一覧を出力する

連語表現を並べ替えて頻度一覧表を作る

フォルダー内のすべてのファイルを対象に、2語表現(2-gram)の一覧を出力する

実行：母語話者データの場合

学習者データの場合

ファイルを選んで、2連語で、上位10位まで

実行

ファイルを選んで、2連語で、上位何位まで表示するか指定できるようにする

実行

ディレクトリー内のすべてのファイルを対象に、3連語で、指定した順位まで

実行

出現頻度の検定：カイ自乗検定

例：　副詞の生起位置

LOB コーパス中のhowever の生起位置を調べたところ、次のような結果になった。

頻度に差があるか: chisq.test()

文末は、少ないのでほとんど使わないとして、文頭と文中の使い分けに何か原則があるのだろうか？

二次元の頻度差を考える。

行列データを作る：matrix()

どこに差があるか：残差分析

群馬大学の青木先生のサイト

対数尤度比検定：G^2

パッケージ Deducer

サンプルサイズに関係ないオッヅ比

効果量

2x2の場合の効果量： φ (ファイ) 係数

2x2以上の場合

R for Learner Corpus Research 2021: 08

sugiura

2021/12/24

連語表現の一覧

ファイルを窓を開いて選択し、2語表現(2-gram)の一覧を出力する

連語表現を並べ替えて頻度一覧表を作る

フォルダー内のすべてのファイルを対象に、2語表現(2-gram)の一覧を出力する

実行：母語話者データの場合

学習者データの場合

ファイルを選んで、2連語で、上位10位まで

実行

ファイルを選んで、2連語で、上位何位まで表示するか指定できるようにする

実行

ディレクトリー内のすべてのファイルを対象に、3連語で、指定した順位まで

実行

出現頻度の検定：カイ自乗検定

例： 副詞の生起位置

LOB コーパス中のhowever の生起位置を調べたところ、次のような結果になった。

頻度に差があるか: chisq.test()

文末は、少ないのでほとんど使わないとして、文頭と文中の使い分けに何か原則があるのだろうか？

二次元の頻度差を考える。

行列データを作る：matrix()

どこに差があるか：残差分析

群馬大学の青木先生のサイト

対数尤度比検定：G^2

パッケージ Deducer

サンプルサイズに関係ないオッヅ比

効果量

2x2の場合の効果量： φ (ファイ) 係数

2x2以上の場合

例：　副詞の生起位置