# How to calculate the PCFG values for English monosyllables with R
# Mike Hammond, LSA, 2011
#
# This file also shows how to get counts for the other rules in the PCFG.
# It intermingles R code, results, and discussion. Results and discussion are
# kept as comments so that the file can be run from top to bottom; the
# numbers in the comments are the values recorded from the author's run.

# Dictionary ----

# Copy the dictionary file:
#   http://dingo.sbs.arizona.edu/~hammond/lsasummer11/newdic.txt
# NOTE(review): the URL names the file newdic.txt but the call below opens
# 'newdic' -- save the download under that name or adjust the path.

# Read in the dictionary. As used below: V1 is the transcription matched by
# the phoneme patterns, V3 is the syllable-count label ('S1', 'S2', ...), and
# V5 is the token count that gets summed.
nd <- read.table('newdic', sep = '\t', quote = '')

# Word-level rules ----

# Counts for different numbers of syllables.
syl_labels <- paste0('S', 1:6)
syl_tokens <- vapply(
  syl_labels,
  function(label) sum(nd[nd$V3 == label, 5]),
  numeric(1)
)
syl_tokens

# There are two rules for syllables:
#   w -> s
#   w -> s w
#
# Here are the counts and which rules apply for each word length:
#   s1  477416  w->s
#   s2  120440  w->sw w->s
#   s3   40745  w->sw w->sw w->s
#   s4   17727  w->sw w->sw w->sw w->s
#   s5    4045  w->sw w->sw w->sw w->sw w->s
#   s6     328  w->sw w->sw w->sw w->sw w->sw w->s

# Total number of tokens; this is the count for w->s, because every word
# uses that rule exactly once.
n_w_s <- sum(nd$V5)
n_w_s
#   w->s = 660716
# NOTE(review): the six counts above add up to 660701, 15 fewer than this
# total -- presumably tokens outside S1-S6; verify against the dictionary.

# Count for w->sw: a word of k syllables uses the rule k - 1 times.
n_w_sw <- sum((seq_along(syl_tokens) - 1) * syl_tokens)
n_w_sw
#   w->sw = 272931
#         = (1*120440) + (2*40745) + (3*17727) + (4*4045) + (5*328)

# Calculate probability for w->s.
p_w_s <- n_w_s / (n_w_s + n_w_sw)
p_w_s
#   p(w->s) = 0.7076722 = 660716/(660716+272931)

# Calculate probability for w->sw.
p_w_sw <- n_w_sw / (n_w_s + n_w_sw)
p_w_sw
#   p(w->sw) = 0.2923278 = 272931/(660716+272931)

# Actual and then estimated probabilities for different word lengths.
p_actual <- syl_tokens / n_w_s
p_estimated <- p_w_sw^(seq_along(syl_tokens) - 1) * p_w_s
round(cbind(actual = p_actual, estimated = p_estimated), 4)
#          actual                     estimated
#   p(s1)  .72   = 477416/660716     .71   = p(w->s)
#   p(s2)  .18   = 120440/660716     .21   = p(w->sw) * p(w->s)
#   p(s3)  .06   =  40745/660716     .06   = p(w->sw)^2 * p(w->s)
#   p(s4)  .03   =  17727/660716     .02   = p(w->sw)^3 * p(w->s)
#   p(s5)  .006  =   4045/660716     .005  = p(w->sw)^4 * p(w->s)
#   p(s6)  .0005 =    328/660716     .0015 = p(w->sw)^5 * p(w->s)

# Syllable expansion rules ----

# Calculate values for the different syllable expansion rules.
# These are limited to (C)V(C) and estimated solely on monosyllables.

# All monosyllables: transcription (column 1) and token count (column 2).
nd1 <- nd[nd$V3 == 'S1', c(1, 5)]

# Character class of vowel symbols shared by every pattern below.
vowel <- '[aeiouIE@UOWY\\^RcxX]'

# Sum the token counts of the rows of `words` whose transcription (column V1)
# matches the regular expression `pattern`.
count_tokens <- function(words, pattern) {
  sum(words[grep(pattern, words$V1), 2])
}

# Only (C)V(C).
cvc <- paste0('^.?', vowel, '.?$')
nd1[grep(cvc, nd1$V1), ]

# Total number of CVC, CV, VC, V monosyllable tokens.
count_tokens(nd1, cvc)

# Onset rules ----

onset_tokens <- c(
  # V-initial items: 87512
  none = count_tokens(nd1, paste0('^', vowel, '.?$')),
  # obstruent-initial items: 116665
  obs = count_tokens(nd1, paste0('^[ptkbdgfsSvzZCJ]', vowel, '.?$')),
  # liquid-initial items: 12681
  liq = count_tokens(nd1, paste0('^[lr]', vowel, '.?$')),
  # nasal-initial items: 27639
  nas = count_tokens(nd1, paste0('^[mn]', vowel, '.?$'))
)
onset_tokens

# Probabilities for onset rules. They are normalised over the four onset
# classes counted above, not over all (C)V(C) tokens.
p_onset <- onset_tokens / sum(onset_tokens)
round(p_onset, 2)
#   p(O -> 0)   = .36 =  87512/(87512+116665+12681+27639)
#   p(O -> obs) = .48 = 116665/(87512+116665+12681+27639)
#   p(O -> liq) = .05 =  12681/(87512+116665+12681+27639)
#   p(O -> nas) = .11 =  27639/(87512+116665+12681+27639)

# Rhyme rules ----

rhyme_tokens <- c(
  # vowel-final items: 126728
  V = count_tokens(nd1, paste0('^.?', vowel, '$')),
  # obstruent-final items: 120301
  Vobs = count_tokens(nd1, paste0('^.?', vowel, '[ptkbdgfsSvzZCJ]$')),
  # liquid-final items: 46276
  Vliq = count_tokens(nd1, paste0('^.?', vowel, '[lr]$')),
  # nasal-final items: 67350
  Vnas = count_tokens(nd1, paste0('^.?', vowel, '[mnG]$'))
)
rhyme_tokens

# Probability values for rhyme rules.
p_rhyme <- rhyme_tokens / sum(rhyme_tokens)
round(p_rhyme, 2)
#   p(R -> V)    = .35 = 126728/(126728+120301+46276+67350)
#   p(R -> Vobs) = .33 = 120301/(126728+120301+46276+67350)
#   p(R -> Vliq) = .13 =  46276/(126728+120301+46276+67350)
#   p(R -> Vnas) = .19 =  67350/(126728+120301+46276+67350)

# VCV sequences ----

# For VCV sequences we don't need all the rules, just the onset and rhyme
# rules. The two parses VC.V and V.CV are compared using the two-decimal
# probabilities above; each product is rounded to two decimals before the
# percentage split is taken.
p_o <- round(p_onset, 2)
p_r <- round(p_rhyme, 2)

# Compare the parses VC.V (`closed`) and V.CV (`open`) for one consonant
# class. `coda` names the rhyme rule and `onset` names the onset rule for
# that class; `closed_share` is the closed parse's share of the two.
vcv_split <- function(coda, onset) {
  closed <- round(p_r[[coda]] * p_o[['none']], 2)
  open <- round(p_r[['V']] * p_o[[onset]], 2)
  c(
    closed = closed,
    open = open,
    closed_share = round(closed / (closed + open), 2)
  )
}

# V-obstruent-V sequences
vcv_split('Vobs', 'obs')
#   VO.V = p(R -> Vobs) * p(O -> 0)    .12 (41%) = .33 * .36
#   V.OV = p(R -> V)    * p(O -> obs)  .17 (59%) = .35 * .48

# V-nasal-V sequences
vcv_split('Vnas', 'nas')
#   VN.V = p(R -> Vnas) * p(O -> 0)    .07 (64%) = .19 * .36
#   V.NV = p(R -> V)    * p(O -> nas)  .04 (36%) = .35 * .11

# V-liquid-V sequences
vcv_split('Vliq', 'liq')
#   VL.V = p(R -> Vliq) * p(O -> 0)    .05 (71%) = .13 * .36
#   V.LV = p(R -> V)    * p(O -> liq)  .02 (29%) = .35 * .05