## To execute any command, highlight it with the mouse and hit Control r.

##########################################################################
## DO THIS ALWAYS AT START
##########################################################################

##If your computer doesn't have these, go to the Packages menu, pick a CRAN mirror site
##and download the necessary "packages"; i.e. languageR and arm.
##Then run these commands, as above, to get them into your computer's memory.

library(languageR)
library(arm)

##Set working directory, so that the relative file names used below (the input
##data files and the .txt files this script writes) are resolved against it.
##Note:  in a path string R wants forward slashes (or doubled backslashes), so
##just copying a path from Windows will not suffice!
##The path below is specific to one machine; change it to match your own folder.


setwd("C:/Dropbox/AR/TheCoinagesInSeuss/Phonesthemes/Analysis/MarchandModels/R")
 
##########################################################################
## Deal with your data file
##########################################################################

## You want a plain-text input file where all the columns are labeled, and the separator of columns is
## a tab.  

##WARNINGS ABOUT INPUT FILE FORMAT

##These issues can cause huge amounts of wasted time!

##CAUTION:  Column headers should be extremely plain; it's best to use nothing but letters (you can
##  also use noninitial digits).
##CAUTION:  The same is true of your data/candidates:  it is wise to use nothing but letters and numbers.
##CAUTION:  R is case-sensitive; always check variable names with care.
##CAUTION:  apostrophes anywhere in your file will create chaos; remove or replace them before proceeding.

##INFO

## Here is the command to read a data file.  Of course, you have to change it to match the file you have.
## sep="\t" is needed so that it will assume that tab is the column separator.

##I am now working with two files, full and simple.
##Simple is actually longer, because I kept in it the source constraints

## Load the coarse file: the coarse models that come first in the modeling section
## (MarchandOnset, MarchandRhyme, MarchandCoda, Marchand) are fit to it.
MyData <- read.table("CoarseMarchandRFile.txt", header = TRUE, sep = "\t")

## The fine file is read again right before the fine model further down.  Reading it here
## too would replace the coarse data just loaded, so the line is kept only as a commented
## alternative; uncomment it (and skip the line above) to inspect the fine file instead.
## MyData <- read.table("FineMarchandRFile.txt", header = TRUE, sep = "\t")



##########################################################################
## Eyeballing your data with contingency tables
##########################################################################

## This command gives you a list of all the variables you have available from your file to work with -- i.e. the column headers.
colnames(MyData)


## Contingency tables.
## Put the dependent variable first.
## In the schema, Line 1 performs the calculation, Line 2 gives raw counts, Line 3 the proportions, and Line 4 the chi-square.

## I really need to learn how to do this with all of the variables at once.

## The tables below take their columns directly from MyData (data = MyData), so attach()
## is not needed.  attach() puts a copy of the columns on the search path, and that copy
## goes stale as soon as MyData is read again or modified.

## Dependent variable with one independent variable.  The barplot at the end graphs it.
writeLines("")
## Two alternative predictors.  When running line by line, run only the one you want:
## the second assignment replaces the first.
MyTable <- xtabs(~ SuffixIden + WeightToStress, data = MyData)

MyTable <- xtabs(~ SuffixIden + StressHigh, data = MyData)
## Raw counts, then proportions of the grand total, then the chi-square test.
MyTable
round(prop.table(MyTable), digits = 3)
summary(MyTable)

## This makes a table with the relative proportions within each column
## (margin 2), i.e. within each value of the independent variable.
PropTable <- prop.table(MyTable, 2)
round(PropTable, digits = 3)
ftable(PropTable)
barplot(PropTable)

##Try this plotting method (template only, kept commented out).
##The object names come from the source's Thanksgiving example and are not defined
##anywhere in this script; substitute your own data frame and columns before
##uncommenting.  The source attribution is a comment so that the file parses.
##
##Gobble.model.2<-lm(Sleepyyy~Turkeyyy, data=dataset.gobble2)
##summary(Gobble.model.2)
##plot(TurkeyTime, NapTime, main="Scatterplot of Thanksgiving", 
##    xlab="Turkey Consumption in Grams ", ylab="Sleep Time in Minutes ", pch=19)
##
##Source:
##https://ademos.people.uic.edu/Chapter12.html

##########################################################################
## Logistic regression
##########################################################################

## Here is how you set up the logistic regression model.
## For linguistics, the best R function for logistic regression is probably bayesglm().
## This is because there are often exceptionless principles--
##   you don't want the weights to go sky high without good justification.
##   bayesglm() employs a prior to enforce this principle
## The reference source for bayesglm() is http://www.stat.columbia.edu/~gelman/research/unpublished/priors7.pdf.
## If you want, you can leave out the word "bayes" in this command and get classical glm instead.

#########################################################
##---SEUSS MODELING---
#########################################################

   ## List the column names of the data frame:
      names(MyData)

   ## Show the first six rows (the default); change n to see more or fewer.
      head(MyData, n = 6L)
   ## Debugging aid for bad input files: write the data frame, as R currently holds it,
   ## to a tab-delimited file that can be opened and inspected.
      write.table(MyData, sep = "\t", file = "Debug.txt")


## Column names once more, for convenience.
names(MyData)

## Prototype coarse model: every coarse constraint enters as a predictor of Status.
MyModel <- bayesglm(
  Status ~ MarchandOnset + MarchandRhyme + MarchandCoda +
    German + Phonotactic + Metrical,
  data = MyData,
  family = "binomial"
)
summary(MyModel)
## Stepwise AIC selection, to prune constraints that do not earn their keep.
step <- stepAIC(MyModel)
step
##This trims back MarchandCoda, giving:

##(Intercept)  MarchandOnset  MarchandRhyme         German    Phonotactic  
##      -4.8682         0.9197         1.1517         3.4510         4.9075  
##     Metrical  
##       1.2279  

##Degrees of Freedom: 18179 Total (i.e. Null);  18174 Residual
##Null Deviance:      4114 
##Residual Deviance: 3765         AIC: 3777

## Refit the coarse model without MarchandCoda and print the full summary;
## this looks like the right model to present.
MyModel <- bayesglm(
  Status ~ MarchandOnset + MarchandRhyme +
    German + Phonotactic + Metrical,
  data = MyData,
  family = "binomial"
)
summary(MyModel)

##Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
##(Intercept)    -4.8682     0.1410 -34.517  < 2e-16 ***
##MarchandOnset   0.9197     0.1449   6.347 2.20e-10 ***
##MarchandRhyme   1.1517     0.1030  11.183  < 2e-16 ***
##German          3.4510     0.3990   8.649  < 2e-16 ***
##Phonotactic     4.9075     0.5800   8.461  < 2e-16 ***
##Metrical        1.2279     0.1919   6.398 1.57e-10 ***
##---
##Signif. codes:  0 *** 0.001 ** 0.01 * 0.05 . 0.1   1
##
##(Dispersion parameter for binomial family taken to be 1)
##
##    Null deviance: 4114.4  on 18179  degrees of freedom
##Residual deviance: 3765.4  on 18174  degrees of freedom
##AIC: 3777.4

## Simpler still: collapse all the Marchand constraints into the single predictor Marchand.
MyModel <- bayesglm(
  Status ~ Marchand + German + Phonotactic + Metrical,
  data = MyData,
  family = "binomial"
)
summary(MyModel)
step <- stepAIC(MyModel)
step
## Not so good, AIC is 3848.7


## Fine Marchand model: each individual Marchand constraint is its own predictor.

## Switch MyData over to the fine file and list its columns.
MyData <- read.table("FineMarchandRFile.txt", header = TRUE, sep = "\t")
names(MyData)

## The predictors are listed in three families (Coda_*, Init_*, Rhyme_*).  Their order
## fixes the order of the rows in the coefficient table, so keep it when editing.
MyModel <- bayesglm(
  Status ~
    ## Coda constraints
    Coda_M + Coda_N + Coda_NG + Coda_R + Coda_S + Coda_Z +
    ## Initial (onset) constraints
    Init_P + Init_PL + Init_PR + Init_B +
    Init_SP + Init_SPL + Init_SPR + Init_BL +
    Init_BR + Init_T + Init_D + Init_DR +
    Init_ST + Init_STR + Init_TR + Init_K +
    Init_KL + Init_KR + Init_SK + Init_SKR +
    Init_G + Init_GL + Init_GR + Init_F +
    Init_FL + Init_FR + Init_TH + Init_THR +
    Init_SW + Init_W + Init_HH + Init_KW +
    Init_M + Init_SKW + Init_TW + Init_SN +
    Init_R + Init_SL + Init_CH + Init_S +
    Init_SH + Init_Z + Init_JH + Init_Y +
    ## Rhyme constraints
    Rhyme_AA1K + Rhyme_AA1P + Rhyme_AA1T + Rhyme_AA2K +
    Rhyme_AA2P + Rhyme_AA2T + Rhyme_AE1P + Rhyme_AE2P +
    Rhyme_AH1K + Rhyme_AH1T + Rhyme_AH2T + Rhyme_EH1K +
    Rhyme_EH1P + Rhyme_EH1T + Rhyme_EH2K + Rhyme_EH2T +
    Rhyme_IH1K + Rhyme_IH1P + Rhyme_IH1T + Rhyme_IH2K +
    Rhyme_IH2T + Rhyme_UH1K + Rhyme_UH1T + Rhyme_AA1SH +
    Rhyme_AA2SH + Rhyme_AH0L + Rhyme_AH1SH + Rhyme_EH1SH +
    Rhyme_IH1SH + Rhyme_UH1SH + Rhyme_UH2SH + Rhyme_AA1B +
    Rhyme_AA1D + Rhyme_AA1G + Rhyme_AA2B + Rhyme_AA2D +
    Rhyme_AA2G + Rhyme_AE1B + Rhyme_AE1D + Rhyme_AE1G +
    Rhyme_AE1KT + Rhyme_AE2B + Rhyme_AE2D + Rhyme_AE2G +
    Rhyme_AH1B + Rhyme_AH1D + Rhyme_AH1G + Rhyme_AH2G +
    Rhyme_EH1B + Rhyme_EH1D + Rhyme_EH1G + Rhyme_EH2D +
    Rhyme_IH1B + Rhyme_IH1D + Rhyme_IH1G + Rhyme_UW1 +
    Rhyme_AA1R + Rhyme_AA2R + Rhyme_AE1K + Rhyme_AE1M +
    Rhyme_AE1MP + Rhyme_AE1NG + Rhyme_AE1NGK + Rhyme_AE1SH +
    Rhyme_AE2K + Rhyme_AE2N + Rhyme_AE2NG + Rhyme_AE2SH +
    Rhyme_AE1T + Rhyme_AE2T + Rhyme_AO1L + Rhyme_AO2L +
    Rhyme_ER1 + Rhyme_ER1L + Rhyme_ER1T + Rhyme_ER2 +
    Rhyme_ER2T + Rhyme_EY2N + Rhyme_IH1F + Rhyme_IH1NG +
    Rhyme_IH1NGK + Rhyme_IY1K + Rhyme_AA1JH + Rhyme_AA2JH +
    Rhyme_IH1CH + Rhyme_IH1S + Rhyme_IH1SK + Rhyme_IH1Z +
    Rhyme_IH2CH + Rhyme_OW1L + Rhyme_OW1N + Rhyme_OW2L +
    Rhyme_OW2N + Rhyme_AA1MP + Rhyme_AA1NGK + Rhyme_AH1JH +
    Rhyme_AO1F + Rhyme_AO1R + Rhyme_AO2F + Rhyme_AO2R +
    Rhyme_AW1CH + Rhyme_AW1L + Rhyme_AW1NS + Rhyme_UW1M +
    Rhyme_UW1N + Rhyme_UW1P + Rhyme_UW2M + Rhyme_UW2N +
    Rhyme_UW2P + Rhyme_AH1F + Rhyme_AH1M + Rhyme_AH1MP +
    Rhyme_AH1NCH + Rhyme_AH1NGK + Rhyme_AH1Z + Rhyme_AH2Z,
  data = MyData,
  family = "binomial"
)
summary(MyModel)
## The author's note here reads: "We can't step on this; it's too big."
## NOTE(review): the stepAIC() call below is nevertheless live, so expect a very long
## run on a model of this size; confirm whether it should be skipped.
step <- stepAIC(MyModel)
step
## See my spreadsheet for the outcome.  AIC is 3497.7.

## "Cream of the crop" model: keep only the fine constraints whose weight has magnitude
## greater than one with p < .001, plus German, Phonotactic and Metrical, and see how
## that does.  I did not pursue this line in the paper.
MyModel <- bayesglm(
  Status ~
    Init_Z + Init_SN + Rhyme_AA1P + Rhyme_AH1MP +
    Rhyme_AH1D + Rhyme_AH1M + Rhyme_AH1B + Init_BL +
    Rhyme_UW1N + Init_GL + Rhyme_EH1K + Init_SKR +
    Rhyme_UW1 + Init_G + Init_Y + Init_KW +
    Init_GR + Init_W + Init_FL + Init_B +
    Init_JH +
    German + Phonotactic + Metrical,
  data = MyData,
  family = "binomial"
)
summary(MyModel)
## Much better on AIC:  3343.8
## I don't think we learn much from this that was not learned from the Full Model,
## which has a better AIC.  So this is not going into the paper.

## ------------------------
## (The rule above must stay a comment: a bare run of hyphens is parsed as unary minus
## signs applied to the next expression, which fails on a model object.)

## This command merely reports the weights that were found:
MyModel

## This one is nicer, because it also gives you a significance test for each weight:
summary(MyModel)

## Print out the model's predictions.
## predict(..., type = "response") applies the inverse logit to the linear predictor and so
## returns the fitted probability for each row; it is stored in a new column of MyData.
## (Writing exp(x) / (1 + exp(x)) by hand gives NaN once exp(x) overflows, and calls
## predict() twice.)
MyData$Prediction <- predict(MyModel, type = "response")
## Print the result out as a tab-delimited file.
## Row names are written too; col.names = NA adds a blank header cell above the row-name
## column, so that the header lines up with the data columns.
write.table(MyData, sep = "\t", file = "ModelPredictions.txt", col.names = NA)

## Export the grammar (each constraint with its weight) as a spreadsheet-ready file.
## idx holds the coefficient table taken from the model summary; it is shown on screen,
## then rounded to three decimals and written out tab-delimited.
idx <- coef(summary(MyModel))
idx
MyConstraints <- round(idx, 3)
write.table(MyConstraints, file = "ConstraintsAndWeights.txt", sep = "\t", col.names = NA)

##########################################################################
## The likelihood ratio test
##########################################################################

## Type II tests for each term of the current model, via Anova() from the car package.

library(car)
Anova(MyModel, type = "II")
## Show any warnings raised along the way.
warnings()












 