@rajiv2806 wrote:
I have almost implemented the same code that is presented in the workshop tutorial on the given dataset.
But when i implement the final "DecisionTreeClassifier " i am not getting the expected output. My code and the output i'm getting is pasted below:
import pandas as pd import numpy as np #Reading Files train = pd.read_csv('D:/AnalyticsVidya/Workshop/train.csv') test = pd.read_csv('D:/AnalyticsVidya/Workshop/test.csv') #train.head() #test.head() #Null Fields Check #train.apply(lambda x: sum(x.isnull())) #test.apply(lambda x: sum(x.isnull())) from scipy.stats import mode var_to_impute = ['Workclass','Occupation','Native.Country'] for var in var_to_impute: train[var].fillna(mode(train[var]).mode[0],inplace=True) test[var].fillna(mode(test[var]).mode[0],inplace=True) #train.apply(lambda x: sum(x.isnull())) #test.apply(lambda x: sum(x.isnull())) categorical_variables = list(train.dtypes.loc[train.dtypes == 'object'].index) #train[categorical_variables].apply(lambda x: len(x.unique())) #test[categorical_variables[:len(categorical_variables)-1]].apply(lambda x: len(x.unique())) #the test dataset i am using the range because the test column ('Income.Group') is not present in the given file for column in categorical_variables: #Determine the categories to combine frq = train[column].value_counts()/train.shape[0] categories_to_combine = frq.loc[frq.values<0.05].index #loop over all categories and combine them as others for cat in categories_to_combine: train[column].replace({cat:'Others'},inplace=True) test[column].replace({cat:'Others'},inplace=True) #train[categorical_variables].apply(lambda x: len(x.unique())) #test[categorical_variables[:len(categorical_variables)-1]].apply(lambda x: len(x.unique())) from sklearn.tree import DecisionTreeClassifier dependent_variable = 'Income.Group' independent_variable = [x for x in train.columns if x not in ['ID',dependent_variable]] model = DecisionTreeClassifier(max_depth = 10,min_samples_leaf = 100, max_features = 'sqrt') model.fit(train[independent_variable],train[dependent_variable]) from sklearn.preprocessing import LabelEncoder le = LabelEncoder() for var in categorical_variables: train[var] = le.fit_transform(train[var]) predictions_train = model.predict(train[independent_variable]) print le.inverse_transform(predictions_train)[:10] print('------------------------------------------------------------------------------------------') for var in categorical_variables[:len(categorical_variables)-1]: test[var] = le.fit_transform(test[var]) predictions_test = model.predict(test[independent_variable]) print le.inverse_transform(predictions_test)[:10]
Actual Output of above code:
['<=50K' '<=50K' '<=50K' '<=50K' '<=50K' '<=50K' '<=50K' '>50K' '<=50K'
'>50K']
['Others' 'Others' 'Others' 'Others' 'Others' 'Others' 'Others' 'Others'
'Others' 'Others']Expected Output should be similar to something like this:
['<=50K' '>50K' '<=50K' '<=50K' '>50K' '>50K' '<=50K' '<=50K' '<=50K' '>50K']
['<=50K' '<=50K' '<=50K' '<=50K' '<=50K' '<=50K' '<=50K' '>50K' '<=50K'
'>50K']Can you please suggest where is the mistake in this code?
Posts: 1
Participants: 1