# How to calculate PCFG values for English sonority sequencing with R.
#
# Two models, one dumb, one slightly less dumb.
#
# mike hammond, LSA, 2011
#
# The script reads a tab-separated dictionary, keeps the rows marked 'S1'
# (monosyllables), sums frequencies by onset type, and turns those sums into
# rule probabilities for two small grammars over onsets.

# read in dictionary (no header, so columns arrive as V1, V2, ...)
nd <- read.table('../../etexts/newdic', sep = '\t', quote = '',
                 stringsAsFactors = FALSE)

# ---------------------------------------------------------------------------
# pcfg number 1 (no positional effects!)
#   s -> o r
#   o -> 0
#   o -> obs o
#   o -> liq o
#   o -> nas o
#   r -> v c
#   c -> 0
#   c -> c obs
#   c -> c liq
#   c -> c nas
#
# onset cases only: cases to consider (pcfg #1)
#   0/NULL        o -> 0
#   obs           o -> obs o, o -> 0
#   nas           o -> nas o, o -> 0
#   liq           o -> liq o, o -> 0
#   obs liq       o -> obs o, o -> liq o, o -> 0
#   obs nas       o -> obs o, o -> nas o, o -> 0
#   obs obs       o -> obs o, o -> obs o, o -> 0
#   obs obs liq   o -> obs o, o -> obs o, o -> liq o, o -> 0
# ---------------------------------------------------------------------------

# segment classes, written once as regex bracket expressions
cls.vow <- '[wyaeiouIE@UOWY\\^RcxX]'   # vowels and glides
cls.obs <- '[ptkbdgfsSvzZCJ]'          # obstruents
cls.nas <- '[mn]'                      # nasals
cls.liq <- '[lr]'                      # liquids

# restrict to monosyllables (not critical)
nd1 <- nd[nd$V3 == 'S1', ]
rm(nd)

# eliminate irrelevant detail
nd1$V2 <- NULL
nd1$V3 <- NULL
nd1$V6 <- NULL

# rename remaining columns
names(nd1) <- c('ipa', 'spell', 'freq')

# get rid of crazy negative values; coerce to double first so that summing
# large integer frequencies below cannot overflow
nd1$freq <- abs(as.numeric(nd1$freq))

# total count
total <- sum(nd1$freq)

# Summed frequency of the words whose transcription starts with the given
# sequence of segment classes immediately followed by a vowel/glide.
# Called with no arguments it counts vowel-initial words (the NULL onset).
onset.count <- function(...) {
  pattern <- paste0('^', ..., cls.vow)
  sum(nd1$freq[grepl(pattern, nd1$ipa)])
}

# 0/NULL count
count.null <- onset.count()
# obs count
count.obs <- onset.count(cls.obs)
# nas count
count.nas <- onset.count(cls.nas)
# liq count
count.liq <- onset.count(cls.liq)
# obs liq count
count.obs.liq <- onset.count(cls.obs, cls.liq)
# obs nas count
count.obs.nas <- onset.count(cls.obs, cls.nas)
# obs obs count
count.obs.obs <- onset.count(cls.obs, cls.obs)
# obs obs liq count
count.obs.obs.liq <- onset.count(cls.obs, cls.obs, cls.liq)

# how to total counts for each rule in pcfg #1
# (each onset type contributes once per use of the rule in its derivation)
#   o -> 0       total
#   o -> obs o   obs + obs.liq + obs.nas + (2 * obs.obs) + (2 * obs.obs.liq)
#   o -> liq o   liq + obs.liq + obs.obs.liq
#   o -> nas o   nas + obs.nas

# denominator for all probs in pcfg #1: one term per rule application, i.e.
# (number of onset consonants + 1) for every counted onset type
count.all <- count.null +
  (2 * count.obs) + (2 * count.nas) + (2 * count.liq) +
  (3 * count.obs.liq) + (3 * count.obs.nas) + (3 * count.obs.obs) +
  (4 * count.obs.obs.liq)

# probabilities for all rules in pcfg #1
# NOTE(review): `total` covers every row kept in nd1, including any word whose
# onset matches none of the eight patterns above, while count.all only covers
# the matched ones; the four probabilities sum to 1 only if every word is
# matched. Confirm whether that is intended.
prob.null <- total / count.all
# "obs nas" onsets use o -> obs o once, so count.obs.nas belongs here
prob.obs <- (count.obs + count.obs.liq + count.obs.nas +
               (2 * count.obs.obs) + (2 * count.obs.obs.liq)) / count.all
# "obs liq" onsets use o -> liq o once, so count.obs.liq belongs here
prob.liq <- (count.liq + count.obs.liq + count.obs.obs.liq) / count.all
prob.nas <- (count.nas + count.obs.nas) / count.all

# probabilities for sample clusters per pcfg #1
# o -> obs o, o -> nas o, o -> liq o, o -> 0
bnr <- prob.obs * prob.nas * prob.liq * prob.null
# o -> obs o, o -> nas o, o -> 0
bn <- prob.obs * prob.nas * prob.null
# o -> liq o, o -> nas o, o -> obs o, o -> 0
rnb <- prob.liq * prob.nas * prob.obs * prob.null
# o -> nas o, o -> obs o, o -> 0
nb <- prob.nas * prob.obs * prob.null

# problems for pcfg #1: order does not matter, so the sonority-violating
# clusters come out exactly as probable as the well-formed ones.
# Compared with a tolerance because the products are multiplied in different
# orders and floating-point multiplication is not associative.
isTRUE(all.equal(bnr, rnb))
isTRUE(all.equal(bn, nb))

# ---------------------------------------------------------------------------
# a model that will work: pcfg #2
#   s -> o r
#   s -> r
#   o -> obs o
#   o -> son o
#   o -> obs
#   o -> son
#
# cases to consider for pcfg #2
#   0/NULL        s -> r
#   obs           s -> o r, o -> obs
#   nas           s -> o r, o -> son
#   liq           s -> o r, o -> son
#   obs liq       s -> o r, o -> obs o, o -> son
#   obs nas       s -> o r, o -> obs o, o -> son
#   obs obs       s -> o r, o -> obs o, o -> obs
#   obs obs liq   s -> o r, o -> obs o, o -> obs o, o -> son
#
# counts for each rule in #2
# smooth son-o case, otherwise it's 0!
#   s -> o r     total - null
#   s -> r       null
#   o -> obs o   obs.liq + obs.nas + obs.obs + (2 * obs.obs.liq)
#   o -> son o   1
#   o -> obs     obs + obs.obs
#   o -> son     nas + liq + obs.liq + obs.nas + obs.obs.liq
#
# denominators for pcfg #2
#   s-rules      total
#   o-rules      obs + nas + liq + (2 * obs.liq) + (2 * obs.nas) +
#                (2 * obs.obs) + (3 * obs.obs.liq)
# ---------------------------------------------------------------------------

# the trailing + 1 is the smoothing count given to o -> son o
o2.denom <- count.obs + count.nas + count.liq +
  (2 * count.obs.liq) + (2 * count.obs.nas) + (2 * count.obs.obs) +
  (3 * count.obs.obs.liq) + 1

# probabilities for rules in pcfg #2
prob.sor2 <- (total - count.null) / total
prob.sr2 <- count.null / total
prob.oo2 <- (count.obs.liq + count.obs.nas + count.obs.obs +
               (2 * count.obs.obs.liq)) / o2.denom
prob.so2 <- 1 / o2.denom
prob.o2 <- (count.obs + count.obs.obs) / o2.denom
prob.s2 <- (count.nas + count.liq + count.obs.liq + count.obs.nas +
              count.obs.obs.liq) / o2.denom

# probabilities for sample clusters for pcfg #2
# LEAVE OUT s -> o r (it is a factor shared by every cluster compared below)
# s -> o r, o -> obs o, o -> son o, o -> son
bnr2 <- prob.oo2 * prob.so2 * prob.s2
# s -> o r, o -> obs o, o -> son
bn2 <- prob.oo2 * prob.s2
# s -> o r, o -> son o, o -> son o, o -> obs
rnb2 <- prob.so2 * prob.so2 * prob.o2
# s -> o r, o -> son o, o -> obs
nb2 <- prob.so2 * prob.o2

# comparisons for pcfg2
bn2 > nb2
bnr2 > rnb2
bnr2 < bn2
nb2 > rnb2