require(vcd) require(cluster) require(psy) # read data data.all <- read.table("ske-results-R.csv", header=T, sep=",") # build sums and select only those with a minimum number of occurrences data.all.sums <- (data.all[,2] + data.all[,3] + data.all[,4]) data <- subset(data.all, data.all.sums > 25) # normalize counts data.sums <- (data[,2] + data[,3] + data[,4]) data.norm <- data data.norm[,2] <- data[,2] / data.sums data.norm[,3] <- data[,3] / data.sums data.norm[,4] <- data[,4] / data.sums #lemma <- data$lemma attach(data) threshold <- 25 pthres <- .90 # high percentage infinitives with a lot of ingforms inf.undecided <- subset(data, data.norm$infinitive >= pthres & data$prep.ingform + data$ingform >= threshold) inf.undecided$statcat <- "undecided" # without a lot of ingforms inf.certain <- subset(data, data.norm$infinitive >= pthres & data$prep.ingform + data$ingform < threshold) inf.certain$statcat <- "infinitive" # high percentage ingforms with a lot if infinitives ing.undecided <- subset(data, data.norm$prep.ingform + data.norm$ingform >= pthres & data$infinitive >= threshold) ing.undecided$statcat <- "undecided" ing.certain <- subset(data, data.norm$prep.ingform + data.norm$ingform >= pthres & data$infinitive < threshold) ing.certain$statcat <- "gerund" both <- subset(data, data.norm$prep.ingform + data.norm$ingform < pthres & data.norm$infinitive < pthres) both$statcat <- "both" # merge all classified subsets into a large data frame with classification # column $statcat data.classified <- merge(inf.undecided, inf.certain, all.x=T,all.y=T) data.classified <- merge(data.classified, ing.certain, all.x=T,all.y=T) data.classified <- merge(data.classified, ing.undecided, all.x=T,all.y=T) data.classified <- merge(data.classified, both, all.x=T,all.y=T) # compute Cohen's Kappa between grammar book rating and our classification k.stat <- ckappa(data.frame(data.classified$grammarbook, data.classified$statcat)) # compute sinplified normalized values with ingform columns combined #data.norm.simple <- data.frame(data.norm[,1], data.norm[,2], data.norm[,3] + data.norm[,4]) #data.simple <- data.frame(data[,1], data[,2], data[,3] + data[,4]) X11() ternaryplot( data[,2:4], pch= 1, id_color="darkblue", id= lemma, cex = .1, dimnames = c("to+infinitive","gerund", "preposition+gerund"), main = "Verbs used with to-infinitive vs. gerund", grid=T ) X11() ternaryplot( data[,2:4], id_color="darkblue", cex = .4, dimnames = c("to+infinitive","gerund", "preposition+gerund"), main = "Verbs used with to-infinitive vs. gerund", grid=T ) postscript(file="ternary-plot1.ps") ternaryplot( data[,2:4], id_color="darkblue", cex = .8, dimnames = c("to+infinitive","gerund", "preposition+gerund"), main="", grid=T ) dev.off() #X11() #clustering1.diss <- daisy(data.norm[,2:4]) #clustering1.clus <- pam(clustering1.diss, 3, diss = TRUE)$clustering #clusplot(clustering1.diss, clustering1.clus, diss = TRUE, shade = TRUE) #X11() #clustering1.diss <- daisy(data.norm.simple[,2:3]) #clustering1.clus <- pam(clustering1.diss, 2, diss = TRUE)$clustering #clusplot(clustering1.diss, clustering1.clus, diss = TRUE, shade = TRUE, main="Verbs used with #to-infinitive vs. gerund") # selecting subsets # subset(data,data.norm$infinitive > 0.9 & data$ingform < 100)