# ---- Session setup ---------------------------------------------------------
# The recorded session repeated these lines several times; once is enough.
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
rm(list = ls())
setwd("/Users/sina/Documents/GitHub/StrongTree/DataSets/non-categorical/original files/")

# ---- Load and inspect the raw data -----------------------------------------
# Read without a header row, so the columns get the default names V1, V2, ...
# (an earlier attempt with header = TRUE was superseded by this call).
data <- read.csv("./iris.data", header = FALSE, sep = ",", na.strings = "",
                 stringsAsFactors = TRUE)
if (interactive()) {
  View(data)
}
summary(data)
names(data)
length(names(data))

# ---- Label column ----------------------------------------------------------
# The last column holds the class label: rename it, show the factor, then
# replace it by its integer level codes.
names(data)[length(names(data))] <- "target"
data$target
data$target <- as.numeric(data$target)
if (interactive()) {
  View(data)
}

# ---- Save the cleaned data -------------------------------------------------
# NOTE(review): the copy written into the working directory looks like a first
# attempt; later sections only write the parent-directory copy. Both writes
# are kept so the files produced are unchanged.
write.csv(data, "iris.csv", row.names = FALSE)
write.csv(data, "./../iris.csv", row.names = FALSE)
names(data)
summary(data)

# ---- Discretise the numeric features into quartile bins --------------------
numeric_columns <- c("V1", "V2", "V3", "V4")
for (x in numeric_columns) {
  quartile_breaks <- c(-Inf, quantile(data[[x]], c(0.25, 0.5, 0.75)), Inf)
  data[[x]] <- cut(data[[x]], quartile_breaks, labels = c(1, 2, 3, 4))
}
summary(data)

# Treat every column (including the target) as categorical.
for (f in names(data)) {
  data[[f]] <- as.factor(data[[f]])
}
summary(data)

# ---- Reset before the next section -----------------------------------------
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
rm(list = ls())
setwd("/Users/sina/Documents/GitHub/StrongTree/DataSets/non-categorical/original files/")
#' Re-encode every column of a data frame as a factor of integer codes.
#'
#' Factor columns are replaced by their underlying integer level codes;
#' non-factor columns keep their values. Every column is then converted to a
#' factor and unused levels are dropped.
#'
#' @param data A data frame.
#' @return A data frame with the same column names in which every column is a
#'   factor. Non-factor columns come first, followed by the factor columns
#'   (the ordering the cbind-based implementation produced).
dataencoder <- function(data) {
  stopifnot(is.data.frame(data))
  # vapply guarantees a logical vector even for a zero-column input.
  must_convert <- vapply(data, is.factor, logical(1))
  column_order <- unname(c(which(!must_convert), which(must_convert)))
  # drop = FALSE keeps a data frame (and the column names) even when a single
  # column is selected; without it a lone factor or lone non-factor column
  # collapsed to a bare vector and the later per-column assignment failed.
  data_num <- data[, column_order, drop = FALSE]
  data_num[] <- lapply(data_num, function(column) {
    if (is.factor(column)) {
      column <- as.integer(column)
    }
    droplevels(as.factor(column))
  })
  data_num
}
# ---- Packages --------------------------------------------------------------
# Loaded before first use: dummyVars() below comes from caret.
library(caret)
library(stringr)
library(outliers)
library(editrules)
library(dplyr)

# ---- Load, label and save the raw data -------------------------------------
data <- read.csv("./iris.data", header = FALSE, sep = ",", na.strings = "",
                 stringsAsFactors = TRUE)
names(data)[length(names(data))] <- "target"
data$target <- as.numeric(data$target)
write.csv(data, "./../iris.csv", row.names = FALSE)

# ---- Discretise the numeric features into quartile bins --------------------
numeric_columns <- c("V1", "V2", "V3", "V4")
for (x in numeric_columns) {
  quartile_breaks <- c(-Inf, quantile(data[[x]], c(0.25, 0.5, 0.75)), Inf)
  data[[x]] <- cut(data[[x]], quartile_breaks, labels = c(1, 2, 3, 4))
}
for (f in names(data)) {
  data[[f]] <- as.factor(data[[f]])
}
if (interactive()) {
  View(data)
}

# The recorded session ran the encoder twice in a row; one pass is kept here.
data <- dataencoder(data)
summary(data)

# ---- One-hot encoding ------------------------------------------------------
# Now we turn all categorical features into one-hot vectors.
data_enc <- data
dmy <- dummyVars(" ~ .-target", data = data_enc)
data_enc <- data.frame(predict(dmy, newdata = data_enc))
if (interactive()) {
  View(data_enc)
}

# If a feature has only two levels we should only keep one column.
# As our convention, we always keep the first one.
# The feature a dummy column belongs to is the part of its name before the
# first dot.
prefix <- gsub("\\..*", "", names(data_enc))
cols <- c()
for (name in names(data)) {
  hits <- which(prefix == name)
  if (length(hits) == 2) {
    cols <- append(cols, hits[1])
  } else {
    cols <- append(cols, hits)
  }
}
# drop = FALSE keeps a data frame even if a single column survives.
data_enc <- data_enc[, cols, drop = FALSE]
kept_prefix <- prefix[cols]
data_enc$target <- data$target

# Taking care of the integer columns: if x_ij = 1 then x_i(j+1) should be one
# as well for numeric features. Columns are addressed by position, in the
# order they were generated, so this does not depend on the level labels.
for (v in numeric_columns) {
  idx <- which(kept_prefix == v)
  for (k in seq_along(idx)[-1]) {
    a <- as.numeric(as.character(data_enc[[idx[k]]]))
    b <- as.numeric(as.character(data_enc[[idx[k - 1]]]))
    data_enc[[idx[k]]] <- as.numeric(a | b)
  }
}
rm(dmy)

# ---- Inspect the results ---------------------------------------------------
if (interactive()) {
  View(data_enc)
  View(data)
}
summary(data)
summary(data_enc)

# ---- Reset before the next section -----------------------------------------
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
library(caret)
library(stringr)
library(outliers)
library(editrules)
library(dplyr)
rm(list = ls())
setwd("/Users/sina/Documents/GitHub/StrongTree/DataSets/non-categorical/original files/")
#' Re-encode every column of a data frame as a factor of integer codes.
#'
#' Factor columns are replaced by their underlying integer level codes;
#' non-factor columns keep their values. Every column is then converted to a
#' factor and unused levels are dropped.
#'
#' @param data A data frame.
#' @return A data frame with the same column names in which every column is a
#'   factor. Non-factor columns come first, followed by the factor columns
#'   (the ordering the cbind-based implementation produced).
dataencoder <- function(data) {
  stopifnot(is.data.frame(data))
  # vapply guarantees a logical vector even for a zero-column input.
  must_convert <- vapply(data, is.factor, logical(1))
  column_order <- unname(c(which(!must_convert), which(must_convert)))
  # drop = FALSE keeps a data frame (and the column names) even when a single
  # column is selected; without it a lone factor or lone non-factor column
  # collapsed to a bare vector and the later per-column assignment failed.
  data_num <- data[, column_order, drop = FALSE]
  data_num[] <- lapply(data_num, function(column) {
    if (is.factor(column)) {
      column <- as.integer(column)
    }
    droplevels(as.factor(column))
  })
  data_num
}
#' One-hot encode the categorical features of a data frame.
#'
#' Every column except `target` is expanded into dummy columns with
#' `dummyVars()` (caret). A feature that produces exactly two dummy columns
#' keeps only the first one. For the features listed in `numeric_columns` the
#' dummies are then made cumulative: once a column is 1, every later column of
#' the same feature is 1 as well.
#'
#' The feature a dummy column belongs to is taken to be the part of its name
#' before the first dot, so feature names must not themselves contain a dot.
#'
#' @param data A data frame with categorical feature columns and a `target`
#'   column.
#' @param numeric_columns Character vector naming the ordered (binned numeric)
#'   features that receive the cumulative encoding.
#' @return A data frame of 0/1 dummy columns followed by `target`.
one_hot_encoder <- function(data, numeric_columns) {
  # Now we turn all categorical features into one-hot vectors.
  dmy <- dummyVars(" ~ .-target", data = data)
  data_enc <- data.frame(predict(dmy, newdata = data))

  # If a feature has only two levels we should only keep one column.
  # As our convention, we always keep the first one.
  prefix <- gsub("\\..*", "", names(data_enc))
  keep <- as.integer(unlist(lapply(names(data), function(name) {
    hits <- which(prefix == name)
    if (length(hits) == 2) hits[1] else hits
  })))
  # drop = FALSE keeps a data frame even if a single column survives.
  data_enc <- data_enc[, keep, drop = FALSE]
  kept_prefix <- prefix[keep]
  data_enc$target <- data$target

  # Taking care of the integer columns: if x_ij = 1 then x_i(j+1) should be
  # one as well for numeric features. Columns are addressed by position (in
  # generation order) rather than by "<feature>.<i>" names, so features with
  # one level, with two levels (second dummy already dropped) or with level
  # labels other than 1..k no longer raise an error.
  for (v in numeric_columns) {
    idx <- which(kept_prefix == v)
    for (k in seq_along(idx)[-1]) {
      current <- as.numeric(as.character(data_enc[[idx[k]]]))
      previous <- as.numeric(as.character(data_enc[[idx[k - 1]]]))
      data_enc[[idx[k]]] <- as.numeric(current | previous)
    }
  }
  data_enc
}
# ---- Iris: quartile (4-bin) encoding ---------------------------------------
# Load the raw file, rename the last column to "target" and store its integer
# level codes.
data <- read.csv(
  "./iris.data",
  header = FALSE,
  sep = ",",
  na.strings = "",
  stringsAsFactors = TRUE
)
names(data)[ncol(data)] <- "target"
data$target <- as.numeric(data$target)
write.csv(data, "./../iris.csv", row.names = FALSE)

# Bin each numeric feature at its quartiles, labelling the bins 1..4.
numeric_columns <- c("V1", "V2", "V3", "V4")
for (feature in numeric_columns) {
  cut_points <- c(-Inf, quantile(data[[feature]], c(0.25, 0.5, 0.75)), Inf)
  data[[feature]] <- cut(data[[feature]], cut_points, labels = c(1, 2, 3, 4))
}

# Every column (target included) becomes a factor before encoding.
data[] <- lapply(data, as.factor)

data <- dataencoder(data)
data_enc <- one_hot_encoder(data, numeric_columns)
View(data_enc)
write.csv(data_enc, "./../iris_enc.csv", row.names = FALSE)
summary(data)
summary(data_enc)
View(data_enc)

# ---- Reset before the next section -----------------------------------------
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
for (pkg in c("caret", "stringr", "outliers", "editrules", "dplyr")) {
  library(pkg, character.only = TRUE)
}
rm(list = ls())
setwd("/Users/sina/Documents/GitHub/StrongTree/DataSets/non-categorical/original files/")
#' Re-encode every column of a data frame as a factor of integer codes.
#'
#' Factor columns are replaced by their underlying integer level codes;
#' non-factor columns keep their values. Every column is then converted to a
#' factor and unused levels are dropped.
#'
#' @param data A data frame.
#' @return A data frame with the same column names in which every column is a
#'   factor. Non-factor columns come first, followed by the factor columns
#'   (the ordering the cbind-based implementation produced).
dataencoder <- function(data) {
  stopifnot(is.data.frame(data))
  # vapply guarantees a logical vector even for a zero-column input.
  must_convert <- vapply(data, is.factor, logical(1))
  column_order <- unname(c(which(!must_convert), which(must_convert)))
  # drop = FALSE keeps a data frame (and the column names) even when a single
  # column is selected; without it a lone factor or lone non-factor column
  # collapsed to a bare vector and the later per-column assignment failed.
  data_num <- data[, column_order, drop = FALSE]
  data_num[] <- lapply(data_num, function(column) {
    if (is.factor(column)) {
      column <- as.integer(column)
    }
    droplevels(as.factor(column))
  })
  data_num
}
#' One-hot encode the categorical features of a data frame.
#'
#' Every column except `target` is expanded into dummy columns with
#' `dummyVars()` (caret). A feature that produces exactly two dummy columns
#' keeps only the first one. For the features listed in `numeric_columns` the
#' dummies are then made cumulative: once a column is 1, every later column of
#' the same feature is 1 as well.
#'
#' The feature a dummy column belongs to is taken to be the part of its name
#' before the first dot, so feature names must not themselves contain a dot.
#'
#' @param data A data frame with categorical feature columns and a `target`
#'   column.
#' @param numeric_columns Character vector naming the ordered (binned numeric)
#'   features that receive the cumulative encoding.
#' @return A data frame of 0/1 dummy columns followed by `target`.
one_hot_encoder <- function(data, numeric_columns) {
  # Now we turn all categorical features into one-hot vectors.
  dmy <- dummyVars(" ~ .-target", data = data)
  data_enc <- data.frame(predict(dmy, newdata = data))

  # If a feature has only two levels we should only keep one column.
  # As our convention, we always keep the first one.
  prefix <- gsub("\\..*", "", names(data_enc))
  keep <- as.integer(unlist(lapply(names(data), function(name) {
    hits <- which(prefix == name)
    if (length(hits) == 2) hits[1] else hits
  })))
  # drop = FALSE keeps a data frame even if a single column survives.
  data_enc <- data_enc[, keep, drop = FALSE]
  kept_prefix <- prefix[keep]
  data_enc$target <- data$target

  # Taking care of the integer columns: if x_ij = 1 then x_i(j+1) should be
  # one as well for numeric features. Columns are addressed by position (in
  # generation order) rather than by "<feature>.<i>" names, so features with
  # one level, with two levels (second dummy already dropped) or with level
  # labels other than 1..k no longer raise an error.
  for (v in numeric_columns) {
    idx <- which(kept_prefix == v)
    for (k in seq_along(idx)[-1]) {
      current <- as.numeric(as.character(data_enc[[idx[k]]]))
      previous <- as.numeric(as.character(data_enc[[idx[k - 1]]]))
      data_enc[[idx[k]]] <- as.numeric(current | previous)
    }
  }
  data_enc
}
# Load the raw iris file, rename the last column to "target" and replace it by
# its integer level codes.
data <- read.csv(
  "./iris.data",
  header = FALSE,
  sep = ",",
  na.strings = "",
  stringsAsFactors = TRUE
)
names(data)[ncol(data)] <- "target"
data$target <- as.numeric(data$target)
summary(data)
# Which columns are features (everything except the target)?
names(data) != "target"
Filter(function(column_name) column_name != "target", names(data))
#' Min-max scale every feature column of a data frame to [0, 1].
#'
#' Each column other than `target` is replaced by
#' `(x - min(x)) / (max(x) - min(x))`.
#'
#' @param data A data frame whose non-`target` columns are numeric.
#' @return `data` with the feature columns rescaled. Missing values stay
#'   missing and are ignored when computing the minimum and maximum. A column
#'   whose values are all equal is mapped to 0 instead of NaN, and a column
#'   that is entirely missing (or empty) is left unchanged.
normalize <- function(data) {
  for (f in names(data)[names(data) != "target"]) {
    column <- data[[f]]
    if (all(is.na(column))) {
      # Nothing to scale; also avoids min()/max() warnings on empty input.
      next
    }
    lowest <- min(column, na.rm = TRUE)
    range_f <- max(column, na.rm = TRUE) - lowest
    data[[f]] <- if (range_f > 0) {
      (column - lowest) / range_f
    } else {
      # Constant column: dividing by a zero range would give NaN.
      as.numeric(column - lowest)
    }
  }
  data
}
# ---- Normalise the iris features and check the result ----------------------
data_org <- normalize(data)
View(data_org)
summary(data_org)
summary(data)

data_norm <- normalize(data)
# Hand check of one scaled value against two candidate ranges.
(5.100 - 4.3) / 3.4
View(data)
View(data_norm)
(5.100 - 4.3) / 3.6

# Overwrites the parent-directory iris.csv with the normalised table.
write.csv(data_norm, "./../iris.csv", row.names = FALSE)

# ---- Reset before the next section -----------------------------------------
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
for (pkg in c("caret", "stringr", "outliers", "editrules", "dplyr")) {
  library(pkg, character.only = TRUE)
}
rm(list = ls())
setwd("/Users/sina/Documents/GitHub/StrongTree/DataSets/non-categorical/original files/")
#' Re-encode every column of a data frame as a factor of integer codes.
#'
#' Factor columns are replaced by their underlying integer level codes;
#' non-factor columns keep their values. Every column is then converted to a
#' factor and unused levels are dropped.
#'
#' @param data A data frame.
#' @return A data frame with the same column names in which every column is a
#'   factor. Non-factor columns come first, followed by the factor columns
#'   (the ordering the cbind-based implementation produced).
dataencoder <- function(data) {
  stopifnot(is.data.frame(data))
  # vapply guarantees a logical vector even for a zero-column input.
  must_convert <- vapply(data, is.factor, logical(1))
  column_order <- unname(c(which(!must_convert), which(must_convert)))
  # drop = FALSE keeps a data frame (and the column names) even when a single
  # column is selected; without it a lone factor or lone non-factor column
  # collapsed to a bare vector and the later per-column assignment failed.
  data_num <- data[, column_order, drop = FALSE]
  data_num[] <- lapply(data_num, function(column) {
    if (is.factor(column)) {
      column <- as.integer(column)
    }
    droplevels(as.factor(column))
  })
  data_num
}
#' One-hot encode the categorical features of a data frame.
#'
#' Every column except `target` is expanded into dummy columns with
#' `dummyVars()` (caret). A feature that produces exactly two dummy columns
#' keeps only the first one. For the features listed in `numeric_columns` the
#' dummies are then made cumulative: once a column is 1, every later column of
#' the same feature is 1 as well.
#'
#' The feature a dummy column belongs to is taken to be the part of its name
#' before the first dot, so feature names must not themselves contain a dot.
#'
#' @param data A data frame with categorical feature columns and a `target`
#'   column.
#' @param numeric_columns Character vector naming the ordered (binned numeric)
#'   features that receive the cumulative encoding.
#' @return A data frame of 0/1 dummy columns followed by `target`.
one_hot_encoder <- function(data, numeric_columns) {
  # Now we turn all categorical features into one-hot vectors.
  dmy <- dummyVars(" ~ .-target", data = data)
  data_enc <- data.frame(predict(dmy, newdata = data))

  # If a feature has only two levels we should only keep one column.
  # As our convention, we always keep the first one.
  prefix <- gsub("\\..*", "", names(data_enc))
  keep <- as.integer(unlist(lapply(names(data), function(name) {
    hits <- which(prefix == name)
    if (length(hits) == 2) hits[1] else hits
  })))
  # drop = FALSE keeps a data frame even if a single column survives.
  data_enc <- data_enc[, keep, drop = FALSE]
  kept_prefix <- prefix[keep]
  data_enc$target <- data$target

  # Taking care of the integer columns: if x_ij = 1 then x_i(j+1) should be
  # one as well for numeric features. Columns are addressed by position (in
  # generation order) rather than by "<feature>.<i>" names, so features with
  # one level, with two levels (second dummy already dropped) or with level
  # labels other than 1..k no longer raise an error.
  for (v in numeric_columns) {
    idx <- which(kept_prefix == v)
    for (k in seq_along(idx)[-1]) {
      current <- as.numeric(as.character(data_enc[[idx[k]]]))
      previous <- as.numeric(as.character(data_enc[[idx[k - 1]]]))
      data_enc[[idx[k]]] <- as.numeric(current | previous)
    }
  }
  data_enc
}
#' Min-max scale every feature column of a data frame to [0, 1].
#'
#' Each column other than `target` is replaced by
#' `(x - min(x)) / (max(x) - min(x))`.
#'
#' @param data A data frame whose non-`target` columns are numeric.
#' @return `data` with the feature columns rescaled. Missing values stay
#'   missing and are ignored when computing the minimum and maximum. A column
#'   whose values are all equal is mapped to 0 instead of NaN, and a column
#'   that is entirely missing (or empty) is left unchanged.
normalize <- function(data) {
  for (f in names(data)[names(data) != "target"]) {
    column <- data[[f]]
    if (all(is.na(column))) {
      # Nothing to scale; also avoids min()/max() warnings on empty input.
      next
    }
    lowest <- min(column, na.rm = TRUE)
    range_f <- max(column, na.rm = TRUE) - lowest
    data[[f]] <- if (range_f > 0) {
      (column - lowest) / range_f
    } else {
      # Constant column: dividing by a zero range would give NaN.
      as.numeric(column - lowest)
    }
  }
  data
}
# ---- Iris: quintile (5-bin) encoding ---------------------------------------
# Load the raw file, rename the last column to "target" and store its integer
# level codes.
data <- read.csv(
  "./iris.data",
  header = FALSE,
  sep = ",",
  na.strings = "",
  stringsAsFactors = TRUE
)
names(data)[ncol(data)] <- "target"
data$target <- as.numeric(data$target)
data_norm <- normalize(data)
numeric_columns <- c("V1", "V2", "V3", "V4")
View(data)

# Bin each numeric feature at its 20/40/60/80 % quantiles, labelling the bins
# 1..5. (This replaces the earlier quartile binning with labels 1..4.)
for (feature in numeric_columns) {
  cut_points <- c(-Inf, quantile(data[[feature]], c(0.2, 0.4, 0.6, 0.8)), Inf)
  data[[feature]] <- cut(data[[feature]], cut_points, labels = c(1, 2, 3, 4, 5))
}

# Every column (target included) becomes a factor before encoding.
data[] <- lapply(data, as.factor)

data <- dataencoder(data)
data_enc <- one_hot_encoder(data, numeric_columns)
write.csv(data_enc, "./../iris_enc_5.csv", row.names = FALSE)
View(data)
View(data_norm)
View(data_enc)

# ---- Reset before the next section -----------------------------------------
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
for (pkg in c("caret", "stringr", "outliers", "editrules", "dplyr")) {
  library(pkg, character.only = TRUE)
}
rm(list = ls())
setwd("/Users/sina/Documents/GitHub/StrongTree/DataSets/non-categorical/original files/")
#' Re-encode every column of a data frame as a factor of integer codes.
#'
#' Factor columns are replaced by their underlying integer level codes;
#' non-factor columns keep their values. Every column is then converted to a
#' factor and unused levels are dropped.
#'
#' @param data A data frame.
#' @return A data frame with the same column names in which every column is a
#'   factor. Non-factor columns come first, followed by the factor columns
#'   (the ordering the cbind-based implementation produced).
dataencoder <- function(data) {
  stopifnot(is.data.frame(data))
  # vapply guarantees a logical vector even for a zero-column input.
  must_convert <- vapply(data, is.factor, logical(1))
  column_order <- unname(c(which(!must_convert), which(must_convert)))
  # drop = FALSE keeps a data frame (and the column names) even when a single
  # column is selected; without it a lone factor or lone non-factor column
  # collapsed to a bare vector and the later per-column assignment failed.
  data_num <- data[, column_order, drop = FALSE]
  data_num[] <- lapply(data_num, function(column) {
    if (is.factor(column)) {
      column <- as.integer(column)
    }
    droplevels(as.factor(column))
  })
  data_num
}
#' One-hot encode the categorical features of a data frame.
#'
#' Every column except `target` is expanded into dummy columns with
#' `dummyVars()` (caret). A feature that produces exactly two dummy columns
#' keeps only the first one. For the features listed in `numeric_columns` the
#' dummies are then made cumulative: once a column is 1, every later column of
#' the same feature is 1 as well.
#'
#' The feature a dummy column belongs to is taken to be the part of its name
#' before the first dot, so feature names must not themselves contain a dot.
#'
#' @param data A data frame with categorical feature columns and a `target`
#'   column.
#' @param numeric_columns Character vector naming the ordered (binned numeric)
#'   features that receive the cumulative encoding.
#' @return A data frame of 0/1 dummy columns followed by `target`.
one_hot_encoder <- function(data, numeric_columns) {
  # Now we turn all categorical features into one-hot vectors.
  dmy <- dummyVars(" ~ .-target", data = data)
  data_enc <- data.frame(predict(dmy, newdata = data))

  # If a feature has only two levels we should only keep one column.
  # As our convention, we always keep the first one.
  prefix <- gsub("\\..*", "", names(data_enc))
  keep <- as.integer(unlist(lapply(names(data), function(name) {
    hits <- which(prefix == name)
    if (length(hits) == 2) hits[1] else hits
  })))
  # drop = FALSE keeps a data frame even if a single column survives.
  data_enc <- data_enc[, keep, drop = FALSE]
  kept_prefix <- prefix[keep]
  data_enc$target <- data$target

  # Taking care of the integer columns: if x_ij = 1 then x_i(j+1) should be
  # one as well for numeric features. Columns are addressed by position (in
  # generation order) rather than by "<feature>.<i>" names, so features with
  # one level, with two levels (second dummy already dropped) or with level
  # labels other than 1..k no longer raise an error.
  for (v in numeric_columns) {
    idx <- which(kept_prefix == v)
    for (k in seq_along(idx)[-1]) {
      current <- as.numeric(as.character(data_enc[[idx[k]]]))
      previous <- as.numeric(as.character(data_enc[[idx[k - 1]]]))
      data_enc[[idx[k]]] <- as.numeric(current | previous)
    }
  }
  data_enc
}
#' Min-max scale every feature column of a data frame to [0, 1].
#'
#' Each column other than `target` is replaced by
#' `(x - min(x)) / (max(x) - min(x))`.
#'
#' @param data A data frame whose non-`target` columns are numeric.
#' @return `data` with the feature columns rescaled. Missing values stay
#'   missing and are ignored when computing the minimum and maximum. A column
#'   whose values are all equal is mapped to 0 instead of NaN, and a column
#'   that is entirely missing (or empty) is left unchanged.
normalize <- function(data) {
  for (f in names(data)[names(data) != "target"]) {
    column <- data[[f]]
    if (all(is.na(column))) {
      # Nothing to scale; also avoids min()/max() warnings on empty input.
      next
    }
    lowest <- min(column, na.rm = TRUE)
    range_f <- max(column, na.rm = TRUE) - lowest
    data[[f]] <- if (range_f > 0) {
      (column - lowest) / range_f
    } else {
      # Constant column: dividing by a zero range would give NaN.
      as.numeric(column - lowest)
    }
  }
  data
}
# Load the sonar data set without a header row.
# NOTE(review): two file names are read in turn and the second result
# overwrites the first - confirm which file is the intended input.
data <- read.csv(
  "./sonar.all-data",
  header = FALSE,
  sep = ",",
  na.strings = "",
  stringsAsFactors = TRUE
)
data <- read.csv(
  "./sonar.data",
  header = FALSE,
  sep = ",",
  na.strings = "",
  stringsAsFactors = TRUE
)
