Posts

Showing posts from January, 2019

Loan - R - Kaggle Competition

# Loan data set: load the raw file and produce a general overview of lending
# volume by year (loan counts and total amount lent).
#
# Each statement sits on its own line: R ends a `#` comment only at a newline,
# so a one-line layout would comment out everything after the first `#`.

library(stringr)  # not used by the statements below; kept for the rest of the script
library(ggplot2)
library(plyr)

data.combined <- read.csv("loan.csv", header = TRUE)
# Untouched copy of the raw import, so the original can be restored later.
data.combined.BackUp <- data.combined
str(data.combined)

# GENERAL OVERVIEW ----

# Derive the loan year from issue_d by taking its second "-"-separated field.
# NOTE(review): presumably issue_d looks like "Dec-2011" -- confirm on the data.
year_loan <- as.character(data.combined$issue_d)
unique(year_loan)
year_loan <- strsplit(year_loan, "-")
# vapply() guarantees a character vector even for empty input, where sapply()
# would return a list; values without a second field become NA.
year_loan <- vapply(year_loan, `[`, character(1), 2)
data.combined$year_loan <- as.factor(year_loan)

# Some general graphs to get an overview.

# Absolute number of loans by year.
ggplot(data.combined, aes(x = year_loan)) +
  geom_bar(fill = "steelblue") +
  ggtitle("Absolute number of loans by year")
# The absolute number of loans is increasing across years.

# Total amount lent by year.
tot_loan_year <- ddply(
  data.combined,
  .(year_loan),
  summarise,
  tot_loan = sum(loan_amnt)
)
tot_loan_year

# The excerpt is cut off here in the source; the remaining plot call is
# incomplete and is kept verbatim instead of being guessed at:
# ggplot(tot_loan_year, aes...

Titanic - R - Kaggle Competition

####################################################################################
#
# PRELIMINARY ANALYSIS
#
####################################################################################
#
# Loads the Titanic train and test sets and stacks them into one data frame,
# `data.combined`, so both can be explored and transformed together.
#
# Each statement sits on its own line: R ends a `#` comment only at a newline,
# so a one-line layout starting with `#` would turn the whole script into a comment.

# Load the data sets.
train <- read.csv("train.csv", header = TRUE)
test <- read.csv("test.csv", header = TRUE)

# Merge the two data sets into one.
# First, add a Survived column to the test set, filled with the placeholder
# "None" in every row.
test.survived <- data.frame(Survived = rep("None", nrow(test)), test)

# Survived was prepended as column 1, so swap the first two columns (values
# and names) to get the column order that the combine step below expects.
test.survived[, c(1, 2)] <- test.survived[, c(2, 1)]
colnames(test.survived)[c(1, 2)] <- colnames(test.survived)[c(2, 1)]

# Now the two frames line up and can be stacked row-wise.
data.combined <- rbind(train, test.survived)

# Drop the first column (the passenger ID, per the original author's note).
data.combined[1] <- NULL

# Some information on the data frame.
str(data.combined)

# The excerpt is cut off here in the source; the original comment reads:
#I don't want Survived and Pclass to be int or chr:...