Kaggle Titanic data

In this article we will explore the Titanic dataset from Kaggle. We are going to use the caret package; the reason for this is that it unifies different models under the same syntax, and it also streamlines training and regularizing models. You will see how easily we can perform cross-validation while training an LDA model.

# Load caret; it also attaches ggplot2 and lattice, so qplot() below works.
library(caret)

# Read the Kaggle training data and drop the high-cardinality text
# columns we will not model on.
train <- read.csv("titanictrain.csv")
train$Cabin <- NULL
train$Ticket <- NULL

# Convert the outcome to a factor BEFORE partitioning: with a numeric y,
# createDataPartition() splits on percentiles, whereas with a factor it
# stratifies on the class labels, which is what we want here.
train$Survived <- as.factor(train$Survived)

# Seed so the train/test split is reproducible.
set.seed(123)
inTrain <- createDataPartition(train$Survived, p = 0.75, list = FALSE)
training <- train[inTrain, ]
testing <- train[-inTrain, ]

# Drop rows with missing values (mostly missing Age).
# NOTE(review): imputation would retain more rows -- TODO consider.
training <- na.omit(training)
testing <- na.omit(testing)

# Exploratory plots: survival by sex/age faceted by passenger class,
# and by family-size variables faceted by port of embarkation.
qplot(data = training, Sex, Age, color = Survived,
      facets = . ~ Pclass, size = I(4))

qplot(data = training, SibSp, Parch, color = Survived,
      facets = . ~ Embarked, size = I(4))

Below we fit an LDA model and check its accuracy.

# 10-fold cross-validation, repeated 10 times, used for all models below.
trcont <- trainControl(method = "repeatedcv", number = 10, repeats = 10)

# Linear discriminant analysis on the main passenger covariates.
# Seed immediately before train() so the CV fold assignment is reproducible.
set.seed(123)
lda.fit <- train(Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare,
                 data = training, method = "lda", trControl = trcont)

# Evaluate on the held-out portion of the training file.
ldacl <- predict(lda.fit, newdata = testing)
confusionMatrix(ldacl, testing$Survived)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 80 29
##          1 13 58
##                                          
##                Accuracy : 0.7667         
##                  95% CI : (0.698, 0.8264)
##     No Information Rate : 0.5167         
##     P-Value [Acc > NIR] : 4.398e-12      
##                                          
##                   Kappa : 0.53           
##  Mcnemar's Test P-Value : 0.02064        
##                                          
##             Sensitivity : 0.8602         
##             Specificity : 0.6667         
##          Pos Pred Value : 0.7339         
##          Neg Pred Value : 0.8169         
##              Prevalence : 0.5167         
##          Detection Rate : 0.4444         
##    Detection Prevalence : 0.6056         
##       Balanced Accuracy : 0.7634         
##                                          
##        'Positive' Class : 0              
## 

Let's try a tree-based method.

# CART decision tree (rpart) with the same formula and CV scheme as the LDA
# fit, so the resampled accuracies are directly comparable.
# Seed before train() so fold assignment is reproducible.
set.seed(123)
tree.fit <- train(Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare,
                  data = training, method = "rpart", trControl = trcont)

# Evaluate on the held-out portion of the training file.
treecl <- predict(tree.fit, newdata = testing)
confusionMatrix(treecl, testing$Survived)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 92 45
##          1  1 42
##                                           
##                Accuracy : 0.7444          
##                  95% CI : (0.6742, 0.8064)
##     No Information Rate : 0.5167          
##     P-Value [Acc > NIR] : 3.172e-10       
##                                           
##                   Kappa : 0.4798          
##  Mcnemar's Test P-Value : 2.298e-10       
##                                           
##             Sensitivity : 0.9892          
##             Specificity : 0.4828          
##          Pos Pred Value : 0.6715          
##          Neg Pred Value : 0.9767          
##              Prevalence : 0.5167          
##          Detection Rate : 0.5111          
##    Detection Prevalence : 0.7611          
##       Balanced Accuracy : 0.7360          
##                                           
##        'Positive' Class : 0               
## 

Below we try a random forest.

# Random forest with the same formula and CV scheme as the other fits.
# Seeding matters twice here: it fixes both the CV fold assignment and the
# forest's internal bootstrap/feature sampling.
set.seed(123)
rf.fit <- train(Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare,
                data = training, method = "rf", trControl = trcont)

# Evaluate on the held-out portion of the training file.
rfcl <- predict(rf.fit, newdata = testing)
confusionMatrix(rfcl, testing$Survived)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 87 32
##          1  6 55
##                                           
##                Accuracy : 0.7889          
##                  95% CI : (0.7219, 0.8461)
##     No Information Rate : 0.5167          
##     P-Value [Acc > NIR] : 3.750e-14       
##                                           
##                   Kappa : 0.5732          
##  Mcnemar's Test P-Value : 5.002e-05       
##                                           
##             Sensitivity : 0.9355          
##             Specificity : 0.6322          
##          Pos Pred Value : 0.7311          
##          Neg Pred Value : 0.9016          
##              Prevalence : 0.5167          
##          Detection Rate : 0.4833          
##    Detection Prevalence : 0.6611          
##       Balanced Accuracy : 0.7838          
##                                           
##        'Positive' Class : 0               
## 

Since the random forest has the best accuracy, we will predict the test set using this model.

# Score the Kaggle test set with the best model (random forest).
test <- read.csv("titanictest.csv")
test$Cabin <- NULL
test$Ticket <- NULL

# NOTE(review): na.omit() drops any test passenger with a missing value
# (e.g. missing Age), so the resulting submission will be missing rows --
# Kaggle requires a prediction for every PassengerId. Kept to preserve the
# original behavior; consider imputing Age/Fare instead.
test <- na.omit(test)

# Build the submission frame directly (PassengerId, Survived) instead of
# the predict -> as.data.frame -> append -> reorder -> rename sequence.
rfpred <- data.frame(
  PassengerId = test$PassengerId,
  Survived = predict(rf.fit, test)
)