Exploratory Data Analysis

1 minute read

  1. This post performs an Exploratory Data Analysis (EDA) on the Staff Promotion data set. A summary of the DataFrame helps us understand each variable's type, its data type, and whether null values are present.

  2. Importing libraries:
     import matplotlib.pyplot as plt
     import numpy as np
     import pandas as pd
     import seaborn as sns
     from imblearn.over_sampling import SMOTE
     from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
     from sklearn.metrics import mean_squared_error, f1_score, precision_score, recall_score
     from sklearn.model_selection import train_test_split, cross_val_score
     from sklearn.preprocessing import StandardScaler
     from xgboost import XGBClassifier
    
  3. Loading of Dataset:
     # Read data.csv (path is relative to the current working directory) into
     # the DataFrame that every later step of the analysis works on.
     Main_Data = pd.read_csv('data.csv')
    
  4. EDAs:
    • Size and Shape of Data:
      # Report the overall size and the shape tuple of the loaded DataFrame.
      # The label names Main_Data, the frame that is actually being measured.
      print('The size of the Main_Data is :', Main_Data.size)
      print(f"Dimension: {Main_Data.shape}")
      
    • Summary Statistics:
       # Summarise every column of Numeric_Data, then transpose the summary
       # table so that each row holds the statistics of one column.
       Stat_of_Main_Data = Numeric_Data.describe(include='all').T
      
    • Checking for Outliers:
       def Detecting_Outliers(Data, threshold=1):
           """Return the values of *Data* whose absolute z-score exceeds *threshold*.

           The z-score of a value ``y`` is ``(y - mean) / std``, with the mean
           and standard deviation taken over *Data* via ``np.mean``/``np.std``.

           Args:
               Data: A sized iterable of numbers (e.g. a list or a DataFrame
                   column).
               threshold: Z-score magnitude above which a value counts as an
                   outlier. Defaults to 1.

           Returns:
               A new list with the outlying values, in their original order.
               The list is empty when *Data* is empty or has no spread.
           """
           if len(Data) == 0:
               return []

           mean = np.mean(Data)
           std = np.std(Data)

           # A zero (or NaN) standard deviation means no value can be ranked
           # against the others; bail out instead of dividing by it.
           if not std > 0:
               return []

           # Build a fresh list on every call so results never leak between
           # calls through shared state.
           return [y for y in Data if np.abs((y - mean) / std) > threshold]
      # Draw a box plot for every int64/float64 column in which
      # Detecting_Outliers reports at least one value.
      fig = plt.figure()
      for i, column in enumerate(Main_Data.columns):
          # Select by position so a duplicated column label still yields a
          # single column.
          values = Main_Data.iloc[:, i]

          # Only integer and floating-point columns are examined.
          if not (values.dtype == np.int64 or values.dtype == np.float64):
              continue

          # Detecting_Outliers may append to the module-level `Outliers` list,
          # so it has to exist and be empty before each call.
          Outliers = []
          Outliers = Detecting_Outliers(values)
          if not Outliers:
              continue

          ax = plt.subplot(1, 1, 1)
          # Missing values are dropped so they cannot blank out the box.
          ax.boxplot(values.dropna())
          plt.title(f"{column} Outlier Plot")
          plt.show()
      
    • Data Visualization: Performing KDE:
      # One distribution plot (histogram with a KDE curve) per entry of
      # `lent`, laid out on a subplot grid.
      # NOTE(review): the figure width is just number_of_columns inches while
      # the height is scaled by 8 -- confirm this is intended.
      # NOTE(review): sns.distplot is deprecated in recent seaborn releases.
      plt.figure(figsize=(number_of_columns, 8*number_of_rows))
      for position, column_name in enumerate(lent, start=1):
          plt.subplot(number_of_rows + 1, number_of_columns, position)
          chart = sns.distplot(Numeric_Data[column_name], kde=True)
          tick_labels = chart.get_xticklabels()
          chart.set_xticklabels(tick_labels, rotation=None,
                                horizontalalignment='right')
      plt.show()
      
    • Data preprocessing:
      • Checking for Missing Values
      • Visualization of the Missing Values in the Dataset
         import matplotlib.style as style
         # Switch matplotlib to the 'ggplot' style sheet before plotting.
         style.use('ggplot')
         # Boolean mask of Main_Data: True wherever a cell is null.
         missing_mask = Main_Data.isnull()
         # Heatmap of the mask, without a colour bar or row labels.
         sns.heatmap(missing_mask, cmap='viridis', cbar=False, yticklabels=False)