Tuesday, January 31, 2017

Data and Code for anti-Shia violence analysis

I wanted to provide the data and the R code for the anti-Shia violence data that I analyzed a while back. Details about the dataset including where to access it from is provided here and here.  Here is the R code that I used to analyze this data:

setwd("/home/sarah/Documents/Interests (2)/Interests/sectarian")
library(doBy)
library(party)
x<- font="">read.csv("seckillingscity.csv")
y<- font="">read.csv("y.csv")
str(x) #checking to see if all the data is in the correct format
str(y)
y$injured[is.na(y$injured)]<- font="">0
y$killed[is.na(y$killed)]<- font="">0
y$Province[566]<- font="">"Sindh"


residfitplot <- font=""> function (model, col="black") {
  f<- font="">-fitted(model)
  r<- font="">-residuals(model, type='pearson')
  plot(f, r, col=col, main='Residuals vs. Fitted')
  L1<- font="">loess(r~f)
  fvec = seq(min(f),max(f),length.out=150)
  lines(fvec,predict(L1,fvec),col=2)
}

#scale-location plot for GLMMADMB object
locscaleplot <- font=""> function(model,col="black") {
  f <- font=""> fitted(model)
  r <- font=""> sqrt(abs(residuals(model, type='pearson')))  
  plot(f,r,col=col, main='Scale-Location Plot') 
  L1 <- font=""> loess(r~f)
  fvec = seq(min(f),max(f),length.out=150)
  lines(fvec,predict(L1,fvec),col=2)
}
#summing the attack and killed data for each year
sumyear1<- font="">summaryBy(killed + injured ~ Year, data = subset(y, tag == "Shia killing" | tag == "Shia blast" | tag == "sectarian conflict" | tag == "sectarian conflict"), FUN = function(x) sum(x))
attacks1<- font="">summaryBy(Date ~ Year, data = subset(y, tag == "Shia killing" | tag == "Shia blast"| tag == "sectarian conflict" | tag == "sectarian conflict" & Province != "Afghanistan"), FUN = function(x) length(x))

# @@ RGEDIT LANDMARK @@: combining attack and killed/injured data

sumyearN<- font="">cbind(sumyear1, attacks1[,2])
colnames(sumyearN)<- font="">c("year", "killed", "injured", "attacks")

sumyearN[14,]<- font="">cbind(2014, 222, 289, 54) #adding data from Pakistan fact sheet

#looking at the means, median and the IQR
summary(sumyearN)

# @@ RGEDIT LANDMARK @@: subset data by province
p<- font="">summaryBy(killed + injured ~ Year + Province, data = subset(y, tag == "Shia killing" | tag == "Shia blast" | tag == "sectarian conflict"), FUN = function(x) sum(x))
pattacks<- font="">summaryBy(Date ~ Year + Province, data = subset(y, tag == "Shia killing" | tag == "Shia blast" | tag == "sectarian conflict"), FUN = function(x) length(x))
province<- font="">cbind(p, pattacks[,3])
colnames(province)<- font="">c("Year", "Province", "Killed", "Injured", "Attacks") 

# @@ RGEDIT LANDMARK @@: subset data by city
c<- font="">summaryBy(killed + injured ~ Year + City, data = subset(y, tag == "Shia killing" | tag == "Shia blast" | tag == "sectarian conflict"), FUN = function(x) sum(x))
city<- font="">summaryBy(killed + injured ~ City, data = subset(y, tag == "Shia killing" | tag == "Shia blast" | tag == "sectarian conflict"), FUN = function(x) sum(x))
city2<- font="">city[order(city$killed), ]
colnames(c)<- font="">c("Year", "City", "Killed", "Injured")
cattacks<- font="">summaryBy(killed + injured ~ Year + City, data = subset(y, tag == "Shia killing" | tag == "Shia blast" | tag == "sectarian conflict"), FUN = function(x) length(x))
city<- font="">cbind(c, cattacks[,3])
colnames(city)<- font="">-c("Year", "City", "Killed", "Injured", "Attacks")

# @@ RGEDIT LANDMARK @@: General anti-Shia violence trends
#plotting the relationship between attacks and year
pdf("Attacksperyear.pdf")
plot(attacks ~ year, data = sumyearN, xlab = "Year", ylab = "Number of attacks", type = 'l', pch = 16, cex = 1.5, cex.lab = 1.5, cex.axis = 1.25, col = "#6E8CD3", lwd = 3)
dev.off()
#cleary, not very normal. resid fit plot looks awful and scale-location plots shows that variance is not consistent, but increasing)
#let's try a poisson regression, because this is count data and count data is generally not poisson distributed
glm1<- font="">glmmadmb(attacks ~ year, data = sumyearN, family = 'poisson', link = 'log')
glm1.1<- font="">glmmadmb(attacks ~ year, data = sumyearN, family = 'nbinom1', link = 'log')
glm1.2<- font="">glmmadmb(attacks ~ year, data = sumyearN, family = 'nbinom', link = 'log')
ICtab(glm1, glm1.1, glm1.2, type = "AIC")
#quasi poisson is best model
#model selection via log likelihood ratio tests
glm1.1<- font="">glm(attacks ~ year, data = sumyearN, family = quasipoisson('log'))
plot(glm1.1)
#residual versus fitted plot is not bad, but scale location plot is not good
#calculating overdispersion for the relationship

phi.hat=sum((residuals(glm1.1, type = "pearson"))^2)/glm1.1$df.residual
#phi.hat is 15.17. The model is very overdispersed. 
#Switching to negative binomial
glm1.21<- font="">glmmadmb(attacks ~ 1, data = sumyearN, family = 'nbinom', link = 'log')
anova(glm1.2, glm1.21)
#year is important predictor of attack
#plotting model onto plot of attack versus year

residfitplot(glm1.2) #looks okay
locscaleplot(glm1.2) #looks okay 
disperse(glm1.2, data = sumyearN)
#dispersion is 1.33, model is not over-dispersed. 
summary(glm1.2)

#year is a significant predictor of attacks. 
svg("attacksperyear.svg")
plot(attacks ~ year, data = sumyearN, xlab = "Year", ylab = "Number of attacks", type = 'l', pch = 16, cex = 2, cex.lab = 1.25, cex.axis = 1, col = "#6E8CD3", lwd = 3)
pred<- font="">seq(from = min(sumyearN$year), to=max(sumyearN$year), length = 14)
data=data.frame("attacks" = 0, "year" = pred)
curve<- font="">predict(glm1.2, data, type = "response")
lines(pred, curve, lwd = 3, lty =2,  col = "black")
dev.off()
#looks ok
#siginficant relationship according to summary
#making regression tree to examine threshold in attacks in data
library(party)
tree1<- font="">ctree(attacks ~ year, data = sumyearN, control = ctree_control(minsplit = 3)) 
png("attacktree.png")
plot(tree1)
dev.off()
#shifts in attacks per year before and after 2007
#exmaining relationship between killed and year
plot(killed ~ year, data = sumyearN, xlab = "Year", ylab = "Number of deaths", pch = 16, cex = 2, cex.lab = 1.5, cex.axis = 1.25, col = "#6E8CD3")
#seems to be a positive relationship between year and number killed
#Trying negative binomial GLMs
glm2<- font="">glm(killed ~ year, family = poisson(link ='log'), data = sumyearN)
glm2.1<- font="">glmmadmb(killed ~ year, family = 'nbinom', data = sumyearN)
glm2.2<- font="">glmmadmb(killed ~ year, family = "nbinom1", link = "log",  data = sumyearN)
ICtab(glm2, glm2.1, glm2.2, type = "AICc")
#negative binomial is the best model
#model selection
glm2.21<- font="">glmmadmb(killed ~ 1, family = "nbinom1", link = "log",  data = sumyearN)
anova(glm2.21, glm2.1)
#year is an important predictor of the number of people killed
residfitplot(glm2.2) #looks okay
locscaleplot(glm2.2) #looks okay
disperse(glm2.2, data = sumyearN)
#overdispersed 7.88
#switching to negative binomial
glm2.11<- font="">glmmadmb(killed ~ 1, family = "nbinom", link = "log",  data = sumyearN)
anova(glm2.1, glm2.11)
#year is an important predictor
residfitplot(glm2.1) #looks okay
locscaleplot(glm2.1) #looks okay
disperse(glm2.1, data = sumyearN)
#underdispersed, 0.70
#plotting model onto data
svg("killedperyear.svg")
plot(killed ~ year, data = sumyearN, xlab = "Year", ylab = "Number of deaths",  type = 'l', pch = 16, cex = 2, cex.lab = 1.25, cex.axis = 1, col = "#6E8CD3", lwd =3)
pred<- font="">seq(from = min(sumyearN$year), to=max(sumyearN$year), length = 14)
data=data.frame("killed" = 0, "year" = pred)
curve<-predict(glm2.1, data, type = "response")
lines(pred, curve, lwd = 3, lty =2, col = "black")
dev.off()
#Making a regression tree to examine thresholds in data
tree2<- font="">ctree(killed ~ year, data = sumyearN, control = ctree_control(minsplit = 3)) 
png("killedtree.png")
plot(tree1)
dev.off()
#shift before and after 2007
#killed per attack
#Plotting killed per attack for each year
plot(killed/attacks ~ year, data = sumyearN, xlab = "Year", ylab = "Number of deaths", type ='l', pch = 16, cex = 2, cex.lab = 1.25, cex.axis = 1, col = "#6E8CD3", lwd = 3)
lm2<- font="">glm(killed/attacks ~ year, data = sumyearN)
#looking at residual versus fitted and scale-location plots. 
#Residual fitted is ok. Scale location plot is not great
plot(lm2)
#using the dredge function in MuMIn package to determine the best model
library(MuMIn)
dredge(lm2)
#lookis like null model is best
#since the plot looks like a non-linear relationship switching to regression trees
tree3<- font="">ctree(killed/attacks ~ year, data = sumyearN, control = ctree_control(minsplit = 3)) 
png("killedattack.png")
plot(tree3)
dev.off()
#again 2007
svg("killedattacksperyear.svg")
plot(killed/attacks ~ year, data = sumyearN, xlab = "Year", ylab = "Number of deaths", pch = 16, cex = 2, cex.lab = 1.25, cex.axis = 1, col = "#6E8CD3")
pred<- font="">seq(from = min(sumyearN$year), to=max(sumyearN$year), length = 14)
data=data.frame("killed" = 0, "year" = pred)
curve<- font="">predict(glm2.1, data, type = "response")
lines(pred, curve, lwd = 1, col = "black")
dev.off()

# @@ RGEDIT LANDMARK @@: Province Analysis
#making a plot of attacks by province
png("AttacksProvince.png")
plot(Attacks ~ Year, data = province, type = 'n', axes = TRUE, xlab = "Year", ylab = "Number of Attacks", cex.lab = 1.5, cex.axis = 1.25, bty = "l", ylim = c(0,150))
lines(attacks ~ year, data = subset(sumyearN, year<= 2013), col = "black", lwd = 3)
lines(Attacks ~ Year, data = subset(province, Province == "Sindh"), col = '#97BDAF', lwd = 3)
lines(Attacks ~ Year, data = subset(province, Province == "Punjab"), col = '#C453A4', lwd = 3)
lines(Attacks ~ Year, data = subset(province, Province == "Baluchistan"), col = '#C46643', lwd =3)
lines(Attacks ~ Year, data = subset(province, Province == "Khyber Pakhtunkhwa"), col = '#94C053', lwd =3)
lines(Attacks ~ Year, data = subset(province, Province == "FATA"), col = '#54433D', lwd = 3)
lines(Attacks ~ Year, data = subset(province, Province == "Gilgit Baltistan"), col = '#7A75BC', lwd = 3)
legend(2004, 150, c("Total", "Sindh","Punjab", "Baluchistan", "KP", "FATA", "Gilgit Baltistan"), col = c("black", "#97BDAF", "#C453A4", "#C46643","#94C053","#54433D","#7A75BC"), pch = 19, cex = 1.25)
dev.off()

#Analyzing Province specific attack data
glmp1<- font="">glmmadmb(Attacks ~ Year, data = province, family = 'poisson', link = 'log')
glmp1.1<- font="">glmmadmb(Attacks ~ Province, data = province, family = 'nbinom1', link = 'log')
glmp1.2<- font="">glmmadmb(Attacks ~ Year, data = subset(province, Province != "Afghanistan"|Province != "Islamabad"), family = 'nbinom', link = 'log')
AIC(glmp1, glmp1.1, glmp1.2)
#negative binomial is the best model
#models also run with Province and Year, but can;t run them together because of lack of replicates
#log likelihood ratio tests
glmp1.3<- font="">update(glmp1.2, .~. - Province)
anova(glmp1.2, glmp1.3)
#Province is significant
summary(glmp1.2)
#All provinces are different from each other in number of attacks. Sindh and Baluchistan lead attacks
residfitplot(glmp1.2)
locscaleplot(glmp1.2)

#conditional inference tree
ptree1<- font="">ctree(Attacks ~ Year*Province, data = province)
predict(ptree1)

#killed per province
png("Killedprovince.png")
plot(Killed ~ Year, data = province, type = 'n', axes = TRUE, xlab = "Year", ylab = " Total Number of People Killed", cex.lab = 1.5, cex.axis = 1.25, bty = "l", ylim = c(0,550))
lines(killed ~ year, data = subset(sumyearN, year<= 2013), col = "black", lwd = 3)
lines(Killed ~ Year, data = subset(province, Province == "Sindh"), col = '#97BDAF', lwd = 3)
lines(Killed ~ Year, data = subset(province, Province == "Punjab"), col = '#C453A4', lwd = 3)
lines(Killed ~ Year, data = subset(province, Province == "Baluchistan"), col = '#C46643', lwd =3)
lines(Killed ~ Year, data = subset(province, Province == "Khyber Pakhtunkhwa"), col = '#94C053', lwd =3)
lines(Killed ~ Year, data = subset(province, Province == "FATA"), col = '#54433D', lwd = 3)
lines(Killed ~ Year, data = subset(province, Province == "Gilgit Baltistan"), col = '#7A75BC', lwd = 3)
legend(2002, 550, c("Total", "Sindh","Punjab", "Baluchistan", "KP", "FATA", "Gilgit Baltistan"), col = c("black","#97BDAF", "#C453A4", "#C46643","#94C053","#54433D","#7A75BC"), pch = 19, cex = 1.25)
dev.off()

#
glmkp1<- font="">glmmadmb(Killed ~ Province*Year, data = subset(province, Province == "Sindh"| Province == "Punjab"| Province == "Baluchistan"| Province == "Khyber Pakhtunkhwa"| Province == "FATA"| Province == "Gilgit Baltistan"), family = 'poisson', link = 'log')
glmkp1.1<- font="">glmmadmb(Killed ~ Province*Year, data = subset(province, Province == "Sindh"| Province == "Punjab"| Province == "Baluchistan"| Province == "Khyber Pakhtunkhwa"| Province == "FATA"| Province == "Gilgit Baltistan"), family = 'nbinom1', link = 'log')
glmkp1.2<- font="">glmmadmb(Killed ~ Province*Year, data = subset(province, Province == "Sindh"| Province == "Punjab"| Province == "Baluchistan"| Province == "Khyber Pakhtunkhwa"| Province == "FATA"| Province == "Gilgit Baltistan"), family = 'nbinom', link = 'log')
AIC(glmkp1, glmkp1.1, glmkp1.2)
#quasi binomial is the best
#log likelihood ratio tests
glmkp1.3<- font="">update(glmkp1.1, .~. - Province:Year)
anova(glmkp1.1, glmkp1.3)
#Interaction is not significant
glmkp1.4<- font="">update(glmkp1.3, .~. - Province)
anova(glmkp1.3, glmkp1.4)
#Province is significant
glmkp1.5<- font="">update(glmkp1.3, .~. - Year)
anova(glmkp1.3, glmkp1.5)
#Year is significant
residfitplot(glmkp1.3)
locscaleplot(glmkp1.3)
#plots look good
summary(glmkp1.1)
#Gilgit Baltistan is different from rest
#Increase in killed over time

#killed per attack - province
kpap<- font="">province$Killed/province$Attacks
province2<- font="">cbind(province, kpap)
kpac<- font="">city$Killed/city$Attacks
city2<- font="">cbind(city, kpac)

png("killedattackprovince.png")
plot(kpap ~ Year, data = province, type = 'n', axes = TRUE, xlab = "Year", ylab = "Total Number of People Killed per Attack", cex.lab = 1.5, cex.axis = 1.25, bty = "l", ylim = c(0,60))
lines(killed/attacks ~ year, data = subset(sumyearN, year <= 2013), col = "black", lwd = 3)
lines(kpap ~ Year, data = subset(province2, Province == "Sindh"), col = '#97BDAF', lwd = 3)
lines(kpap ~ Year, data = subset(province2, Province == "Punjab"), col = '#C453A4', lwd = 3)
lines(kpap ~ Year, data = subset(province2, Province == "Baluchistan"), col = '#C46643', lwd =3)
lines(kpap ~ Year, data = subset(province2, Province == "Khyber Pakhtunkhwa"), col = '#94C053', lwd =3)
lines(kpap ~ Year, data = subset(province2, Province == "FATA"), col = '#54433D', lwd = 3)
lines(kpap ~ Year, data = subset(province2, Province == "Gilgit Baltistan"), col = '#7A75BC', lwd = 3)
legend(2001, 58, c("Total","Sindh","Punjab", "Baluchistan", "KP", "FATA", "Gilgit Baltistan"), col = c("Black", "#97BDAF", "#C453A4", "#C46643","#94C053","#54433D","#7A75BC"), pch = 19, cex = 1.25)
dev.off()
boxplot(kpap ~ Province, data = subset(province2, Province != "Afghanistan" & Province != "Sndh"), las = 2)
#running models
kpap1<- font="">lm(log(kpap + 1) ~ Year*Province, data = province2)
dredge(kpap1)
#Province only is the best model
plot(kpap1)
#not so bad
summary(kpap1)
#conditional inference trees
kpaptree1<- font="">ctree(kpap ~ Year*Province, data = province2)

# @@ RGEDIT LANDMARK @@: City Analysis
#making a plot of attacks by most common cities in dataset
#Karachi, Quetta, Gilgit, Peshawar, Dera Ismail Khan, Hangu, 
city[order(city$Year, -city$Attacks, -city$Killed),]
png("AttacksCity.png")
plot(Attacks ~ Year, data = city, type = 'n', axes = TRUE, xlab = "Year", ylab = "Number of Attacks", cex.lab = 1.5, cex.axis = 1.25, bty = "l", ylim = c(0,150))
lines(attacks ~ year, data = subset(sumyearN, year<= 2013), col = "black", lwd = 3)
lines(Attacks ~ Year, data = subset(city, City == "Karachi"), col = '#97BDAF', lwd = 3)
lines(Attacks ~ Year, data = subset(city, City == "Quetta"), col = '#C453A4', lwd = 3)
lines(Attacks ~ Year, data = subset(city, City == "Peshawar"), col = '#C46643', lwd =3)
lines(Attacks ~ Year, data = subset(city, City == "Dera Ismail Khan"), col = '#94C053', lwd =3)
legend(2002, 150, c("Total","Karachi","Quetta", "Peshawar", "Dera Ismail Khan"), col = c("Black","#97BDAF", "#C453A4", "#C46643","#94C053"), pch = 19, cex = 1.25)
dev.off()

#model
#unable to run Year*City model because of lack of replicates. City only model run
glmc1<- font="">glmmadmb(Attacks ~ City*Year, data = subset(city, City == "Karachi"|City == "Quetta"|City == "Peshawar"|City == "Dera Ismail Khan"), family = 'poisson', link = 'log')
glmc1.1<- font="">glmmadmb(Attacks ~ City, data = subset(city, City == "Karachi"|City == "Quetta"|City == "Peshawar"|City == "Dera Ismail Khan"), family = 'nbinom1', link = 'log')
glmc1.2<- font="">glmmadmb(Attacks ~ City, data = subset(city, City == "Karachi"|City == "Quetta"|City == "Peshawar"|City == "Dera Ismail Khan"), family = 'nbinom', link = 'log')
AIC(glmc1, glmc1.1, glmc1.2)
#negative binomial model is the best
#log likelihood ratio tests
glmc1.3<- font="">update(glmc1.2, .~. - City*Year)
anova(glmc1.2, glmc1.3)
#City is significant
residfitplot(glmc1.2)
summary(glmc1.2)
#Attack rates differ per year per city
residfitplot(glmc1.2)
locscaleplot(glmc1.2)
#plots look good

#conditional inference trees
ptree1<- font="">ctree(Attacks ~ Year*City, data = subset(city, City == "Karachi"|City == "Quetta"|City == "Peshawar"|City == "Dera Ismail Khan"))
predict(ptree1)


#killed per city
png("killedcity.png")
plot(Killed ~ Year, data = city, type = 'n', axes = TRUE, xlab = "Year", ylab = "Total Number of People Killed", cex.lab = 1.5, cex.axis =1.25, bty = "l", ylim = c(0,600))
lines(killed ~ year, data = subset(sumyearN, year <= 2013), col = "black", lwd = 3)
lines(Killed~ Year, data = subset(city, City == "Karachi"), col = '#97BDAF', lwd = 3)
lines(Killed ~ Year, data = subset(city, City == "Quetta"), col = '#C453A4', lwd = 3)
lines(Killed ~ Year, data = subset(city, City == "Peshawar"), col = '#C46643', lwd =3)
lines(Killed ~ Year, data = subset(city, City == "Dera Ismail Khan"), col = '#94C053', lwd =3)
legend(2001, 600, c("Total", "Karachi","Quetta", "Peshwara", "Dera Ismail Khan"), col = c("black", "#97BDAF", "#C453A4", "#C46643","#94C053","#54433D"), pch = 19, cex = 1.25)
dev.off()
#GLMs
glmkc1<- font="">glmmadmb(Killed ~ City*Year, data = subset(city, City == "Karachi"|City == "Quetta"|City == "Peshawar"|City == "Dera Ismail Khan"),  family = 'poisson', link = 'log')
glmkc1.1<- font="">glmmadmb(Killed ~ City*Year, data = subset(city, City == "Karachi"|City == "Quetta"|City == "Peshawar"|City == "Dera Ismail Khan"), family = 'nbinom1', link = 'log')
glmkc1.2<- font="">glmmadmb(Killed ~ City*Year, data = subset(city, City == "Karachi"|City == "Quetta"|City == "Peshawar"|City == "Dera Ismail Khan"), family = 'nbinom', link = 'log')
AIC(glmkc1, glmkc1.1, glmkc1.2)
#quasi binomial is the best
#log likelihood ratio tests
glmkc1.3<- font="">update(glmkc1.1, .~. - City:Year)
anova(glmkc1.1, glmkc1.3)
#Interaction is not significant
glmkc1.4<- font="">update(glmkc1.3, .~. - City)
anova(glmkc1.3, glmkc1.4)
#City is significantly different
glmkc1.5<- font="">update(glmkc1.3, .~. - Year)
anova(glmkc1.3, glmkc1.5)
#year is significant
residfitplot(glmkc1.3)
summary(glmkc1.3)
#Increasing attacks per year. Quetta is marginally significantly different from all other cities
tree2<- font="">ctree(Killed ~ City*Year, data = subset(city, City == "Karachi"|City == "Quetta"|City == "Peshawar"|City == "Dera Ismail Khan", control = ctree_control(teststat = "quad", testtype = "MonteCarlo")
#Killed don't differ per year per city
residfitplot(glmkp1.1)
locscaleplot(glmkp1.1)

#interaction is not significant
png("killedattackcity.png")
plot(kpac ~ Year, data = city, type = 'n', axes = TRUE, xlab = "Year", ylab = "Total Number of People Killed per Attack", cex.lab = 1.5, cex.axis =1.25, bty = "l", ylim = c(0,60))
lines(killed/attacks ~ year, data = subset(sumyearN, year <= 2013), col = "black", lwd = 3)
lines(kpac~ Year, data = subset(city2, City == "Karachi"), col = '#97BDAF', lwd = 3)
lines(kpac ~ Year, data = subset(city2, City == "Quetta"), col = '#C453A4', lwd = 3)
lines(kpac ~ Year, data = subset(city2, City == "Peshawar"), col = '#C46643', lwd =3)
lines(kpac ~ Year, data = subset(city2, City == "Dera Ismail Khan"), col = '#94C053', lwd =3)
legend(2001, 60, c("Total","Karachi","Quetta", "Peshawar", "Dera Ismail Khan"), col = c("black","#97BDAF", "#C453A4", "#C46643","#94C053","#54433D","#7A75BC"), pch = 19, cex = 1.25)
dev.off()
#models
kpac1<- font="">lm(log(kpac+1) ~ Year*City, data = subset(city2, City == "Karachi"|City == "Quetta"|City == "Peshawar"|City == "Dera Ismail Khan"))
dredge(kpac1)
#Null model is best, followed by year
plot(kpac1)
#plots look fine
#conditional inference trees
library(party)
tree1<- font="">ctree(Attacks ~ Year + Province, data = province, control = ctree_control(teststat = "quad", testtype = "MonteCarlo"))
plot(tree1)
tree2<- font="">ctree(Attacks ~ Year + City, data = subset(city, City == "Karachi"|City == "Quetta"|City == "Peshawar"|City == "Dera Ismail Khan"), control = ctree_control(teststat = "quad"))
#across these cities violence picks up after 2011
tree3<- font="">ctree(Killed ~ Year + City, data = subset(city, City == "Karachi"|City == "Quetta"|City == "Peshawar"|City == "Dera Ismail Khan"), control = ctree_control(teststat = "quad"))
plot(tree3)
tree4<- font="">ctree(Killed ~ Year + Province, data = province, control = ctree_control(teststat = "quad"))
plot(tree4)
#same as attacks, pre 2011 Baluchistan/FATA/Kurram Region highest, followed by Sindh/Punjab and then Gilgit Baltistan/Islamabad. After 2011, number of Shias killed across all provinces/regions rise. 

#Examining province specific trees
tree4<- font="">ctree(Attacks ~ Province, data = subset(province, Province == "Sindh"|Province == "Punjab"|Province == "Khyber Pakhtunkhwa"|Province == "Baluchistan"|Province == "Gilgit Baltistan"), control = ctree_control(minsplit = 10, teststat = "quad", testtype = "Teststatistic")))
boxplot(Attacks ~ Province, data = province)
plot(tree4)
tree4.1<- font="">ctree(Killed ~ Province + Year, data = subset(province, Province == "Sindh"|Province == "Punjab"|Province == "KP"|Province == "Baluchistan"|Province == "Gilgit Baltistan"), control = ctree_control(minsplit = 10, teststat = "quad", testtype = "Teststatistic"))
x11()
plot(tree4.1)
tree4.2<- font="">ctree(Killed/Year ~ Province, data = province, control = ctree_control(minsplit = 3))
plot(tree4.2)
#Nothing is important. No specific differences between provinces
#Examining cities
x11()
tree5<- font="">ctree(Attacks ~ City + Year, data = city, control = ctree_control(teststat = "quad", testtype = "Univariate"))
plot(tree5)
summary(tree5)
#Attacks is not changing by city, 
tree6<-ctree(Killed ~  City, data = city)
x11()
#Killed not changing every year by city
#same for killed/attacks

#Running glms for city and province data
glm4<- font="">glmmadmb(Attacks ~ Province*Year, data = subset(province, Province == "Sindh"|Province == "Punjab"|Province == "KP"|Province == "Baluchistan"|Province == "Gilgit Baltistan"), family = 'poisson')
glm4.1<- font="">glmmadmb(Attacks ~ Province*Year, data = subset(province, Province == "Sindh"|Province == "Punjab"|Province == "KP"|Province == "Baluchistan"|Province == "Gilgit Baltistan" ), family = 'nbinom1')
glm4.2<- font="">glmmadmb(Attacks ~ Province*Year, data = subset(province, Province == "Sindh"|Province == "Punjab"|Province == "KP"|Province == "Baluchistan"|Province == "Gilgit Baltistan"), family = 'nbinom', link = 'log') 
ICtab(glm4, glm4.1, glm4.2)
#negative binomial is the best model
glm4.3<- font="">glmmadmb(Attacks ~ Province + Year,  data = subset(province, Province == "Sindh"|Province == "Punjab"|Province == "KP"|Province == "Baluchistan"|Province == "Gilgit Baltistan"), family = 'nbinom', link = 'log') 
anova(glm4.2, glm4.3)
#no interaction
glm4.31<- font="">glmmadmb(Attacks ~ Province,  data = subset(province, Province == "Sindh"|Province == "Punjab"|Province == "KP"), family = 'nbinom', link = 'log') 
anova(glm4.3, glm4.31) #year is important
glm4.32<- font="">glmmadmb(Attacks ~ Year,  data = subset(province, Province == "Sindh"|Province == "Punjab"|Province == "KP"), family = 'nbinom', link = 'log') 
anova(glm4.3, glm4.32) #province is important
residfitplot(glm4.3) #looks ok
locscaleplot(glm4.3) #looks ok
disperse(glm4.3, province) #model is under-dispersed 0.813
summary(glm4.3)
#running tree model with data
tree2<- font="">ctree(Attacks ~ Province + Year,  data = subset(province, Province == "Sindh"|Province == "Punjab"|Province == "KP"|Province == "Baluchistan"|Province == "Gilgit Baltistan"), control = ctree_control(minsplit =1, mtry = 1))
plot(tree2)
nodes(tree2,1)
#model suggests that when looking at province-wide data 
library(partykit)
z<- font="">subset(province, Province == "Sindh"|Province == "Punjab"|Province == "KP"|Province == "Baluchistan"|Province == "Gilgit Baltistan")
tree2<- font="">ctree(Attacks ~ Province + Year,  data = subset(province, Province == "Sindh"|Province == "Punjab"|Province == "KP"|Province == "Baluchistan"|Province == "Gilgit Baltistan"), control = ctree_control(minsplit =10))

# @@ RGEDIT LANDMARK @@: Comparing number of ppl killed in terrorism vs anti-Shia violence
gen<- font="">read.csv("generalviolence.csv")
#removing 2015 and 2016 data
#plotting Shia killed and general deaths
ge2<- font="">subset(gen, Year <=2014)
png("GeneralvsShia.png")
plot(killed ~ year, data = subset(sumyearN, year > 2002), xlab = "Year", ylab = "Number of People Killed", ylim = c(0,3200), bty = "l", cex.axis = 1.25, cex.lab = 1.5, type = 'n')
lines(killed ~ year, data = subset(sumyearN, year > 2002), col = "#c14a34", lwd= 2)
lines(Civilians ~ Year, data = ge2, col = "#6c44c0", lwd= 2)
legend(2003, 3000, c("Terrorism", "Shia Killing"), col = c("#6c44c0","#c14a34"), lty = c(1,1), lwd = c(3,3), cex = 1.25)
dev.off()
zz<-(subset(sumyearN, year <= 2003)$killed/ge2$Civilians)*100
#plotting percent deaths by Shias
plot(killed ~ year, data = subset(sumyearN, year =< 2003), type = "n", xlab = "Year", ylab = "Percent of Shia deaths in Total", ylim = c(0,100), bty = "l", cex.axis = 1.25, cex.lab = 1.5)

#running model
k1<-glmmadmb(Civilians ~ Year, data = ge2, family = "poisson", link = "log")
k2<-glmmadmb(Civilians ~ Year, data = ge2, family = "nbinom", link = "log")
k3<-glmmadmb(Civilians ~ Year, data = ge2, family = "nbinom1", link = "log")
AIC(k1, k2, k3)
#negative binomial is the best model
summary(k2)
#Year is significant
lines(zz ~ Year, data = ge2, col = "#71b54b", lwd = 2)
sum(zz)/length(ge2$Year)
#conditional inference tree
tree5<-ctree(zz ~ Year, data = ge2, control = ctree_control(testtype = "Univariate"))
plot(tree5)


No comments:

Post a Comment