
library(glmnet)

#run_settings
ModelInputs=c("Level1","Level2","Level3")
#note in this code FEorNME corresponds to FNE in the paper
DependentVar=c("FE","FEorNME");

#optimal lambda derived outside of this code
mylambdap=c(0.000837678,0.000837678,0.001060818,0.004375479,0.002728333,0.002728333)

#####################################################
#####################################################
#####################################################

#function to summarize key performance factors from a confusion matrix

funSummarizeCM <-function(df,allresultsfile,mystring){
  if (nrow(df)>1){
    myrecall=df[2,2]/sum(df[,2]) #aka sensitivity
    specificity=df[1,1]/sum(df[,1])
    myprecision=df[2,2]/sum(df[2,]) #aka PPV
    F1=2*myrecall*myprecision/(myrecall+myprecision)
    acc=sum(diag(df))/sum(df)
  } else{
    #which one is missing
    if (rownames(df)[1]=="0"){
      #no failures predicted
      myrecall=0;
      myprecision=0;
      F1=0;
      specificity=1;
      acc=sum(diag(df))/sum(df)
    } else {
      #everyone predicted to be failure
      myrecall=1;
      myprecision=df[1,2]/sum(df)
      F1=2*myrecall*myprecision/(myrecall+myprecision)
      specificity=0;
      acc=myprecision;
    }
  }
  Cmline0=paste("","0",'1',sep=',')
  if (nrow(df)>1){
    if(ncol(CM)>1){ # 2x2 matrix
      Cmline1=paste("0",CM[1,1],CM[1,2], sep=',')
      Cmline2=paste("1",CM[2,1],CM[2,2], sep=',')
    } else if (colnames(df)[1]=="0"){
      Cmline1=paste("0",CM[1,1],"0", sep=',')
      Cmline2=paste("1",CM[2,1],"0", sep=',')
    } else {
      Cmline1=paste("0","0",CM[1,1], sep=',')
      Cmline2=paste("1","0",CM[2,1], sep=',')
    }
  } else if (rownames(df)[1]=="0"){
    if(ncol(CM)>1){ # 1x2 matrix - with zero row
      Cmline1=paste("0",CM[1,1],CM[1,2], sep=',')
      Cmline2=paste("1","0","0", sep=',')
    } else if (colnames(df)[1]=="0"){ #1x1 matrix, 0,0  
      Cmline1=paste("0",CM[1,1],"0", sep=',')
      Cmline2=paste("1","0","0", sep=',')
    } else { #1x1 matrix, 0,1 element
      Cmline1=paste("0","0",CM[1,1], sep=',')
      Cmline2=paste("1","0","0", sep=',')
    }
  } else { # just the one row
    if(ncol(CM)>1){ # 1x2 matrix - with zero row
      Cmline1=paste("0","0","0", sep=',')
      Cmline2=paste("1",CM[1,1],CM[1,2], sep=',')
    } else if (colnames(df)[1]=="0"){ #1x1 matrix, 1,0  
      Cmline1=paste("0","0","0", sep=',')
      Cmline2=paste("1",CM[1,1],"0", sep=',')
    } else { #1x1 matrix, 0,1 element
      Cmline1=paste("0","0","0", sep=',')
      Cmline2=paste("1","0",CM[1,1], sep=',')
    }
  }
  
  
  if (file.exists(allresultsfile)){
    cat(c(mystring,",","accuracy,",acc,",F1 Score,",F1,",recall(sensitivity),",myrecall,",precision(PPV),",myprecision,",specificity,",specificity,",",Cmline0,",",Cmline1,",",Cmline2,"\n"),file=allresultsfile, append=TRUE)
  } else {
    cat(c(mystring,",","accuracy,",acc,",F1 Score,",F1,",recall(sensitivity),",myrecall,",precision(PPV),",myprecision,",specificity,",specificity,",",Cmline0,",",Cmline1,",",Cmline2,"\n"),file=allresultsfile)#,sep="\n")
  }
}

##############################################################3

#read in data
my_data<-read.csv("/data/Paper_Final.csv",header=T)
#summary(my_data)
nrow(my_data)

################# data preprocessing ################3333

#create binary indicators of class
my_data$IsA = ifelse((my_data$norm_class == "A"), 1, 0);
my_data$IsB = ifelse((my_data$norm_class == "B"), 1, 0);
my_data$IsM = ifelse((my_data$norm_class == "M"), 1, 0);

#### adding volume variables
my_data$VolA=my_data$VolA_ME+my_data$VolA_FE+my_data$VolA_NME;
my_data$VolB=my_data$VolB_ME+my_data$VolB_FE+my_data$VolB_NME;
my_data$VolM=my_data$VolM_ME+my_data$VolM_FE+my_data$VolM_NME;

#vectorize Moodies initial
my_data$MIR_A = ifelse((my_data$MIR_mvb == "A"), 1, 0);
my_data$MIR_AA = ifelse((my_data$MIR_mvb == "AA"), 1, 0);
my_data$MIR_AAA = ifelse((my_data$MIR_mvb == "AAA"), 1, 0);
my_data$MIR_B = ifelse((my_data$MIR_mvb == "B"), 1, 0);
my_data$MIR_BA = ifelse((my_data$MIR_mvb == "BA"), 1, 0);
my_data$MIR_BAA = ifelse((my_data$MIR_mvb == "BAA"), 1, 0);
my_data$MIR_blank = ifelse((my_data$MIR_mvb == "blank"), 1, 0);
my_data$MIR_C = ifelse((my_data$MIR_mvb == "C" | my_data$MIR_mvb == "CA" |my_data$MIR_mvb == "CAA"), 1, 0);
my_data$MIR_NR = ifelse((my_data$MIR_mvb == "NR"), 1, 0);

#################################################################

ct=0

#do not change loop settings as they relate directly to the optimal labmda.
for (mydependent in 1:2){ #loop throught the dependent variable
    for (myinputs in 1:3){# loop through the model levels
    
      #advance the ct for labmda
      ct=ct+1;
      
        
    ###################Run Settings################################
    #datastring
    datestring=paste(substr(toString(Sys.Date()),6,7 ),substr(toString(Sys.Date()),9,10),sep = "", collapse = NULL) 
    
    #ModelInputs
    modelstring=ModelInputs[myinputs]
    #dependent
    dependentstring=DependentVar[mydependent]
    
    #namestring
    mynamestring=paste(datestring,modelstring,dependentstring);
    
    #lambdastring
    lambdafile=paste("/results/",datestring,"_lambda.csv",sep = "");
    
    #allresultsfile
    allresultsfile=paste("/results/",datestring,"_all_results.csv",sep = "");
    
    ################ Data Processing ###################
    #dependent variable
    
    if (mydependent==1){
      #the dependent varaible selection - FE
      my_data$failed = ifelse(my_data$norm_label == "FE", 1, 0)
    } else {
      #the dependent varaible selection - NE&FE
      my_data$failed = ifelse((my_data$norm_label == "FE" | my_data$norm_label == "NME" ), 1, 0)
    }
    
    
    ## copying the data for subset selection 
    my_sdata = my_data;
    
   
    #ModelInputs=c("Level1","Level2","Level3")
    #Level1 - Amount, Class, TT features, MIR.
    #Level2a - Add: The presence of SSUP 1024, the volume in As,Ms and Bs, fraction in As,Ms and Bs . 
    #Level3b - add main assigned topic 
    

    if (myinputs==1){
      my_sdata<-my_sdata[c(1,47,71:127,165:170,203:205,209:217,218)]
    
    }else if (myinputs==2){
      my_sdata<-my_sdata[c(1,47,71:127,165:170,203:205,209:217,172,206:208,129:131,218)]
    
    } else {
      my_sdata<-my_sdata[c(1,47,71:127,165:170,203:205,209:217,172,206:208,129:131,173:202,218)]
    }
    
   
    
    ########################### Running the analysis once on full data to report regression coefficients ###################
    
    datatrain <-my_sdata
    
    #remove RandSplit ...
    datatrain$RandSplit<-1;
    
    #convert training data to matrix format
    x<- model.matrix(failed~.,datatrain)
    y <- datatrain$failed; 
    
    cv.out <- glmnet(x,y,alpha=1,family="binomial", type.measure=mse, standardize=T, lambda=mylambdap[ct] )
    
    #get coefficients
    df.coef <- coef(cv.out, s = "lambda.1se")
    df.coef = data.frame(name = df.coef@Dimnames[[1]][df.coef@i + 1], coefficient = df.coef@x)
    
    #write the coefficients to a file - version independent
    my_filename=paste("/results/",mynamestring,"_coef.csv",sep = "")
    write.csv(df.coef, file = my_filename)
    
    #summarize and save the (insample) performance
    pred  <- predict(cv.out,newx=x,s=cv.out$lambda.1se,type="response")
    mymodel_predict <- rep(0,nrow(datatrain))
    mymodel_predict[pred>.5] <- 1
    CM=table(mymodel_predict,datatrain$failed)
    funSummarizeCM(CM,allresultsfile,mynamestring)
    
    
    ########################### Running the analysis ad 10 fold CV ###################
    #Split up the data into random 10ths according to the prospectus level split
    
    for (i in 1:10){
      #Split up the data into random 10ths according to the prospectus level split
      datatrain <-my_sdata[(my_sdata$RandSplit < i) | (my_sdata$RandSplit > i), ]
      datatest <- my_sdata[(my_sdata$RandSplit == i), ]
      
      #remove RandSplit ...
      datatrain$RandSplit<-1;
      
      #convert training data to a matrix format
      x<- model.matrix(failed~.,datatrain)
      y <- datatrain$failed; 
    
      xtest <-model.matrix(failed~.,datatest)
      ytest <- datatest$failed; 
      
      cv.out <- glmnet(x,y,alpha=1,family="binomial", type.measure=mse, standardize=T, lambda=mylambdap[ct] )
      
      #get coef
      df.coef <- coef(cv.out, s = "lambda.1se")
      df.coef = data.frame(name = df.coef@Dimnames[[1]][df.coef@i + 1], coefficient = df.coef@x)
    
      #write the coefficients to a file - version independent
      #remove "#" to print coefficients from all cross validation runs
      #my_filename=paste("/results/",mynamestring,"_CV",i,"_coef.csv",sep = "")
      #write.csv(df.coef, file = my_filename)
      
      #summarize and save the performance
      pred  <- predict(cv.out,newx=xtest,s="lambda.1se",type="response")
      mymodel_predict <- rep(0,nrow(datatest))
      mymodel_predict[pred>.5] <- 1
      CM=table(mymodel_predict,datatest$failed)
      #comment "in" if you want the performance measures by cross validation cut
      #funSummarizeCM(CM,allresultsfile,mynamestring)
     
    
      
    
    }# end of cross validation loop
    
    
    ###############################################
     }
}






###################################################3
############# END
##########################################3
