import numpy as np
import pandas as pd

# %matplotlib nbagg
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib

import seaborn as sns
sns.set(font_scale=1.5)
sns.set_style('whitegrid') #whitegrid
sns.set_palette('muted')

# 한글사용가능하게하기
from matplotlib import font_manager
font_name = font_manager.FontProperties(fname='C:/Windows/Fonts/malgun.ttf').get_name()
matplotlib.rc('font', family=font_name)
print (plt.rcParams['font.family'] )

['Malgun Gothic']

데이터 로딩

import pandas as pd
train = pd.read_csv('data/titanic/train.csv')
test = pd.read_csv('data/titanic/test.csv')

데이터 확인

train과 test 비교후, test에 없는 칼럼이 종속변수
head()와 info()를 비교하여 범주형과 숫자형의 type이 제대로 되었는지 확인하기

print(train.shape)
print(train.columns)
train.head()

(891, 12)
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB

print(test.shape)
print(test.columns)
test.head()

(418, 11)
Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB

칼럼명 소문자로 일괄변경

방법1 : 딕셔너리 형태로 변경( 일부분만 가능)

train = \
train.rename( columns = {
    'PassengerId' : 'passengerid',
    'Pclass' : 'pclass' ,
    'Name' : 'name',
    'Sex' : 'sex',
    'Age' : 'age',
    'SibSp' : 'sibsp',
    'Ticket' : 'ticket' ,
    'Fare' : 'fare',
    'Cabin' : 'cabin',
    'Embarked' : 'embarked',
    'Parch' : 'Parch'.lower() ,
    'Survived' : 'survived',
})

test = \
test.rename( columns = {
    'PassengerId' : 'passengerid',
    'Pclass' : 'pclass' ,
    'Name' : 'name',
    'Sex' : 'sex',
    'Age' : 'age',
    'SibSp' : 'sibsp',
    'Ticket' : 'ticket' ,
    'Fare' : 'fare',
    'Cabin' : 'cabin',
    'Embarked' : 'embarked',
        'Parch' : 'Parch'.lower() ,
#     'Survived' : 'survived',
})

리스트 컴프리헨션을 통한 한번에 소문자 처리

train.columns

Index(['passengerid', 'survived', 'pclass', 'name', 'sex', 'age', 'sibsp',
       'parch', 'ticket', 'fare', 'cabin', 'embarked'],
      dtype='object')

[col.lower() for col in train.columns]

['passengerid',
 'survived',
 'pclass',
 'name',
 'sex',
 'age',
 'sibsp',
 'parch',
 'ticket',
 'fare',
 'cabin',
 'embarked']

train.columns = [col.lower() for col in train.columns]
train.head()

범주 <-> 숫자 type 제대로 되었는지 확인

Pclass 범주인데 숫자로
Survived 범주인데 숫자로
Parch 도
Sipsp 도

train['survived'] = train['survived'].astype( object )
train['pclass'] = train['pclass'].astype( object )

train['parch'] = train['parch'].astype( object )
train['sibsp'] = train['sibsp'].astype( object )

test['pclass'] = test['pclass'].astype( object )

train['parch'] = train['parch'].astype( object )
train['sibsp'] = train['sibsp'].astype( object )

Null 체크

train.isnull().sum()

passengerid      0
survived         0
pclass           0
name             0
sex              0
age            177
sibsp            0
parch            0
ticket           0
fare             0
cabin          687
embarked         2
dtype: int64

train.isnull().sum().reset_index()

missing_df = train.isnull().sum().reset_index()
missing_df.columns = ['columns', 'count']
missing_df

missing_df = train.isnull().sum().reset_index()
missing_df.columns = ['columns', 'count']
missing_df['missing_rate'] = missing_df['count'] / train.shape[0] # 미싱 갯수 / 전체데이터의 데이터수 - shape(row,col)의 첫번째!!
missing_df

train

missing_df = train.isnull().sum().reset_index()
missing_df.columns = ['columns', 'count']
missing_df['missing_rate'] = missing_df['count'] / train.shape[0] # 미싱 갯수 / 전체데이터의 데이터수 - shape(row,col)의 첫번째!!
missing_df.loc[ missing_df['missing_rate'] != 0, :] # missing_Rate가 0이 아닌 것들만 가졍괴

test

missing_df = test.isnull().sum().reset_index()
missing_df.columns = ['columns', 'count']
missing_df['missing_rate'] = missing_df['count'] / test.shape[0] # 미싱 갯수 / 전체데이터의 데이터수 - shape(row,col)의 첫번째!!
missing_df.loc[ missing_df['missing_rate'] != 0, :] # missing_Rate가 0이 아닌 것들만 가졍괴

종속변수 확인 및 종속변수에 대한 다른변수들 관계 추론

종속변수의 범주별 빈도 체크

matplotlib 기본

train['survived'].value_counts().plot(kind='bar')

<matplotlib.axes._subplots.AxesSubplot at 0x259dad8add8>

sns

fig, ax = plt.subplots(1, 2,  # 1x2  Pieplot + barplot
                        figsize= (12,8))

# 1 X1  Pie plot
train['survived'].value_counts().plot.pie(
                                          explode = [0, 0.1],
                                          autopct = '%1.1f%%', # 퍼센트는 소수점1자리까지
                                          ax = ax[0],
                                          shadow = True 
                                         )
ax[0].set_title('Pie plot')
ax[0].set_ylabel('')

# # 1 X 2 bar plot (sns.countplot)
# train['survived'].value_counts().plot(
#                                       kind= 'bar',
#                                      ax = ax[1],
#                                      )
sns.countplot(
            'survived',      #칼럼명
             data = train, # df
             ax = ax[1]    # 좌표평면
             )

plt.tight_layout()

import relationship as r
r.categorical_all(train, 'survived')

0    549
1    342
Name: survived, dtype: int64

단변수 탐색1 : 범주형 칼럼의 빈도

범주형 칼럼명만 가져오기 & id와 종속변수 제외시키기

train['survived'].dtypes

train['survived'].dtypes == "object"

category_feature = [ col for col in train.columns if train[col].dtypes == "object"]
category_feature

['survived',
 'pclass',
 'name',
 'sex',
 'sibsp',
 'parch',
 'ticket',
 'cabin',
 'embarked']

id칼럼과 종속변수칼럼 제외시키기 by set

# set으로 마이너스 연산을 한 뒤, list()로 감싸서 다시 리스트로 만든다.
categorical_feature = list( set(category_feature)  - set( ['passengerid', 'survived']) )
categorical_feature

['embarked', 'ticket', 'parch', 'cabin', 'pclass', 'name', 'sex', 'sibsp']

범주칼럼명 리스트를 이용한 범주칼럼들 한꺼번에 빈도 보기

for col in categorical_feature:
    f, ax = plt.subplots(1, 1, figsize=(10,7))
    sns.countplot(
            col,                 #칼럼명
             data = train, # df
             )
    plt.title(col)
    plt.show() # 여러개 그림을 한셀에 띄울 때, plt.show()를 통해 하나씩 끊어주기!

함수화하기 ( 단변수 범주형 칼럼의 빈도 ) with 범주형 칼럼명 리스트

def categorical_all(df, categorical_feature):
    # 칼럼명 하나만 입력시
    if type(categorical_feature) is str:
        sns.countplot(
                categorical_feature,                 #칼럼명 1개
                 data = df,                                # df
                 )
        plt.title(categorical_feature)           #칼럼명 1개
        plt.tight_layout()
        plt.show() # 여러개 그림을 한셀에 띄울 때, plt.show()를 통해 하나씩 끊어주기!
        
    else:
        # 범주형 칼럼명 리스트가 들어왔을 때
        for col in categorical_feature:
            sns.countplot(
                    col,                 #칼럼명
                     data = df, # df
                     )
            plt.title(col)
            plt.tight_layout()
            plt.show() # 여러개 그림을 한셀에 띄울 때, plt.show()를 통해 하나씩 끊어주기!

categorical_all(train, 'cabin')

categorical_all(train, categorical_feature)

단변수 탐색2 : 숫자형 칼럼의 분포

숫자형 칼럼명 가져오기 = 전체칼럼 - 범주형칼럼 - (id+종속변수)

train.head()

numerical_feature = list(  set(train.columns) - set(categorical_feature) - set(['passengerid', 'survived']) )
numerical_feature

['age', 'fare']

np.sort(numerical_feature)

array(['age', 'fare'], dtype='<U4')

numerical_feature = list(  set(train.columns) - set(categorical_feature) - set(['passengerid', 'survived']) )
numerical_feature = np.sort(numerical_feature)
numerical_feature

array(['age', 'fare'], dtype='<U4')

숫자형 칼럼의 분포 sns.distplot 한꺼번에 그리기

for col in numerical_feature:
    print(train[col].describe())
    sns.distplot( train.loc[train[col].notnull(), col] ) # 해당칼럼 중 NaN아닌것만 인덱싱한 칼럼만 그리기
    plt.title(col)
    plt.show()

count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: age, dtype: float64

count    891.000000
mean      32.204208
std       49.693429
min        0.000000
25%        7.910400
50%       14.454200
75%       31.000000
max      512.329200
Name: fare, dtype: float64

숫자형 칼럼의 분포2 boxplot

 
for col in numerical_feature:
    print(train[col].describe())
    sns.boxplot( data=train.loc[train[col].notnull(), col] )
    plt.title(col)
    plt.show()

count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: age, dtype: float64

count    891.000000
mean      32.204208
std       49.693429
min        0.000000
25%        7.910400
50%       14.454200
75%       31.000000
max      512.329200
Name: fare, dtype: float64

숫자형 칼럼의 분포( hist(dist) + boxplot ) 한번에 그리기

for col in numerical_feature:
    
    # 숫자형 칼럼의 기술통계량
    print(train[col].describe())
    print(f"Skewed : {train[col].skew():.2f}")
        
    f, ax = plt.subplots(1, 2, figsize=(18,9))
    
    # dist
    sns.distplot( train.loc[train[col].notnull(), col],
                ax = ax[0] ) 
    ax[0].set_title(col +'\'s hist(dist) plot' )
    
    # boxplot
    sns.boxplot( data=train.loc[train[col].notnull(), col],
                ax = ax[1] )
    ax[1].set_title(col +'\'s boxplot')
    
    plt.tight_layout()
    plt.show()

count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: age, dtype: float64
Skewed : 0.39

count    891.000000
mean      32.204208
std       49.693429
min        0.000000
25%        7.910400
50%       14.454200
75%       31.000000
max      512.329200
Name: fare, dtype: float64
Skewed : 4.79

함수화하기 ( 단변수 숫자형 분포 dist+boxplot ) with 숫자형 칼럼명 리스트

def numerical_all(df, numerical_feature):
    for col in numerical_feature:

        # 숫자형 칼럼의 기술통계량
        print(df[col].describe())
        print(f"Skewed : {train[col].skew():.2f}")
        f, ax = plt.subplots(1, 2, figsize=(18,9))

        # dist
        sns.distplot( df.loc[train[col].notnull(), col],
                    ax = ax[0] ) 
        ax[0].set_title(col +'\'s hist(dist) plot' )
        ax[0].legend(loc='upper left', bbox_to_anchor=(1.0, 1.0) ) # 범례 밖으로 빼기

        # boxplot
        sns.boxplot( data=df.loc[df[col].notnull(), col],
                    ax = ax[1] )
        ax[1].set_title(col +'\'s boxplot')
        ax[1].legend(loc='upper left', bbox_to_anchor=(1.0, 1.0) ) # 범례 밖으로 빼기

        plt.tight_layout()
        plt.show()

이변수 탐색1 : 종속변수(범주)별 - 범주형 변수

방법1: `특정(종속)범주별 범주칼럼 인덱싱`하여서 각 범주별 series를 df로 합치기

survived = train.loc[ train['survived'] == 0,'sex' ].value_counts()
survived.name = 'Survived'

dead = train.loc[ train['survived'] == 1,'sex' ].value_counts()
dead.name = 'Dead'

# 시리즈를 리스트형식으로 df합치기
df = pd.DataFrame( [ survived, dead] )
df.head()

print(df)
df.plot(kind='bar')

          male  female
Survived   468      81
Dead       109     233

<matplotlib.axes._subplots.AxesSubplot at 0x2962005c550>

방법2 : pd.crosstab 으로 그리기

빈도

pd.crosstab( train['sex'], train['survived'])

pd.crosstab( train['sex'], train['survived']).plot(kind='barh',
                                                  stacked=True)

<matplotlib.axes._subplots.AxesSubplot at 0x296212901d0>

pd.crosstab( train['sex'], train['survived'], 
           margins = True).style.background_gradient(cmap='summer_r')

비율

cross = pd.crosstab( train['sex'], train['survived'])
cross_ratio = cross / cross.sum()
cross_ratio

cross_ratio.plot(kind='bar', stacked=True)

<matplotlib.axes._subplots.AxesSubplot at 0x196cfa31ef0>

cross_ratio.transpose().plot(kind='bar', stacked=True)

<matplotlib.axes._subplots.AxesSubplot at 0x196cfd34eb8>

방법3 : groupby( 범주) [ 종속변수].집계.unstack() - 빈도+평균도 가능 + 가상df로서 apply()로 비율계산가능함

빈도

sex_to_survived = train.groupby('sex') ['survived'].value_counts().unstack()
sex_to_survived

sex_to_survived.plot(kind='bar')

<matplotlib.axes._subplots.AxesSubplot at 0x196cf4b85f8>

비율 : groupby( ['범주' ,'종속']) ['종속'] 으로 비율 계산하기

비율을 계산하려면, 숫자칼럼 형태( 범주에서 counts() .unstack()도 먹인상태)
.count().unstack() : 범주별, 종속별, 종속의 범주마다 row단순 count -> 칼럼으로
.count().unstack() 한 것을 칼럼별로 계산하도록 df/df.sum() 으로 비율계산
.transpose() 를 통해 index = x축에 종속변수가 가야 비율이 가득찬다

dict(  list(  train.groupby(['sex', 'survived'])  )  ).keys()

dict_keys([('female', 0), ('female', 1), ('male', 0), ('male', 1)])

dict(  list(  train.groupby(['sex', 'survived'])  )  )['female', 0].head()

train.groupby('sex')['survived'].value_counts()

sex     survived
female  1           233
        0            81
male    0           468
        1           109
Name: survived, dtype: int64

cross_df = train.groupby('sex') ['survived'].value_counts().unstack()
cross_ratio = cross_df / cross_df.sum()
cross_ratio

cross_ratio.transpose().plot(kind='bar', stacked=True, figsize=(10,8))

<matplotlib.axes._subplots.AxesSubplot at 0x196d17e84a8>

방법4 : sns.countplot( '범주칼럼' , hue= '종속변수' ) -> 비율은 볼 수 없다

sns.countplot('sex',
             data = train,
             hue='survived')

<matplotlib.axes._subplots.AxesSubplot at 0x196cfa68080>

범주-종속(범주) 종속변수별 범주의 비율 한번에 그리기

cross_df = train.groupby('pclass')['survived'].value_counts().unstack()
# 세로로 합하고, 칼럼별로 나누어 비율계산하기
print(pd.DataFrame(cross_df))
cross_ratio = cross_df/ cross_df.sum()
# 세로방향기준으로 각 비율이 나눠져있는데, 그냥 그리면 row별로 x축에 틱이 찍힘
# 뒤집기
cross_ratio = cross_ratio.transpose()
cross_ratio.plot(kind='bar',
                stacked=True,
                figsize=(10,5))

survived    0    1
pclass            
1          80  136
2          97   87
3         372  119

<matplotlib.axes._subplots.AxesSubplot at 0x1dd55006e48>

종속-범주형변수 관계 함수정의하기

def cate_to_categorical(data,name_c,  name_d ):
    # index 범주형 변수/ col 종속 변수
    cross_df = data.groupby( name_c)[name_d].value_counts().unstack()
    # 세로로 합하고, 칼럼별로 나누어 비율계산하기
    print(cross_df)
    cross_ratio = cross_df/ cross_df.sum()
    # 세로방향기준으로 각 비율이 나눠져있는데, 그냥 그리면 row별로 x축에 틱이 찍힘
    # 뒤집기
    cross_ratio = cross_ratio.transpose()
    
    f, ax = plt.subplots(1, 2, figsize=(18,9))
    
    cross_df.plot(kind='bar',
                 ax=ax[0])
    ax[0].set_title(name_c +' to ' +name_d )
    ax[0].legend(loc='upper left', bbox_to_anchor=(1.0, 1.0) ) # 범례 밖으로 빼기
    
    cross_ratio.plot(kind='bar',
                     ax = ax[1],
                    stacked=True)
    ax[1].set_title(name_d +' to '+name_c +' ratio' )
    ax[1].legend(loc='upper left', bbox_to_anchor=(1.0, 1.0)) # 범례 밖으로 빼기
    
    plt.tight_layout()
    plt.show() # for문에 들어갈 것 대비 끝내기

relationship_new.cate_to_categorical(train, 'survived', 'sex')

survived    0    1
sex               
female     81  233
male      468  109

cate_to_categorical(train, 'survived', 'pclass')

survived    0    1
pclass            
1          80  136
2          97   87
3         372  119

import relationship_new

relationship_new.cate_to_categorical(train, 'survived', 'pclass')

survived    0    1
pclass            
1          80  136
2          97   87
3         372  119

relationship_new.cate_to_categorical(train, 'survived', 'parch')

survived      0      1
parch                 
0         445.0  233.0
1          53.0   65.0
2          40.0   40.0
3           2.0    3.0
4           4.0    NaN
5           4.0    1.0
6           1.0    NaN

for+함수(범주-범주)문을 통해 모든 종속칼럼에 대해 모든 범주칼럼의 분포와 비율 그려보기

범주가 너무 많을때는 금지!! feature engineering

for col in categorical_feature:
    relationship_new.cate_to_categorical(train, 'survived', col)

survived    0    1
embarked          
C          75   93
Q          47   30
S         427  217

survived    0    1
sex               
female     81  233
male      468  109

survived                                           0    1
name                                                     
Abbing, Mr. Anthony                              1.0  NaN
Abbott, Mr. Rossmore Edward                      1.0  NaN
Abbott, Mrs. Stanton (Rosa Hunt)                 NaN  1.0
Abelson, Mr. Samuel                              1.0  NaN
Abelson, Mrs. Samuel (Hannah Wizosky)            NaN  1.0
Adahl, Mr. Mauritz Nils Martin                   1.0  NaN
Adams, Mr. John                                  1.0  NaN
Ahlin, Mrs. Johan (Johanna Persdotter Larsson)   1.0  NaN
Aks, Mrs. Sam (Leah Rosen)                       NaN  1.0
Albimona, Mr. Nassef Cassem                      NaN  1.0
Alexander, Mr. William                           1.0  NaN
Alhomaki, Mr. Ilmari Rudolf                      1.0  NaN
Ali, Mr. Ahmed                                   1.0  NaN
Ali, Mr. William                                 1.0  NaN
Allen, Miss. Elisabeth Walton                    NaN  1.0
Allen, Mr. William Henry                         1.0  NaN
Allison, Master. Hudson Trevor                   NaN  1.0
Allison, Miss. Helen Loraine                     1.0  NaN
Allison, Mrs. Hudson J C (Bessie Waldo Daniels)  1.0  NaN
Allum, Mr. Owen George                           1.0  NaN
Andersen-Jensen, Miss. Carla Christine Nielsine  NaN  1.0
Anderson, Mr. Harry                              NaN  1.0
Andersson, Master. Sigvard Harald Elias          1.0  NaN
Andersson, Miss. Ebba Iris Alfrida               1.0  NaN
Andersson, Miss. Ellis Anna Maria                1.0  NaN
Andersson, Miss. Erna Alexandra                  NaN  1.0
Andersson, Miss. Ingeborg Constanzia             1.0  NaN
Andersson, Miss. Sigrid Elisabeth                1.0  NaN
Andersson, Mr. Anders Johan                      1.0  NaN
Andersson, Mr. August Edvard ("Wennerstrom")     NaN  1.0
...                                              ...  ...
Widegren, Mr. Carl/Charles Peter                 1.0  NaN
Widener, Mr. Harry Elkins                        1.0  NaN
Wiklund, Mr. Jakob Alfred                        1.0  NaN
Wilhelms, Mr. Charles                            NaN  1.0
Willey, Mr. Edward                               1.0  NaN
Williams, Mr. Charles Duane                      1.0  NaN
Williams, Mr. Charles Eugene                     NaN  1.0
Williams, Mr. Howard Hugh "Harry"                1.0  NaN
Williams, Mr. Leslie                             1.0  NaN
Williams-Lambert, Mr. Fletcher Fellows           1.0  NaN
Windelov, Mr. Einar                              1.0  NaN
Wiseman, Mr. Phillippe                           1.0  NaN
Woolner, Mr. Hugh                                NaN  1.0
Wright, Mr. George                               1.0  NaN
Yasbeck, Mr. Antoni                              1.0  NaN
Yasbeck, Mrs. Antoni (Selini Alexander)          NaN  1.0
Young, Miss. Marie Grice                         NaN  1.0
Youseff, Mr. Gerious                             1.0  NaN
Yousif, Mr. Wazli                                1.0  NaN
Yousseff, Mr. Gerious                            1.0  NaN
Yrois, Miss. Henriette ("Mrs Harbeck")           1.0  NaN
Zabour, Miss. Hileni                             1.0  NaN
Zabour, Miss. Thamine                            1.0  NaN
Zimmerman, Mr. Leo                               1.0  NaN
de Messemaeker, Mrs. Guillaume Joseph (Emma)     NaN  1.0
de Mulder, Mr. Theodore                          NaN  1.0
de Pelsmaeker, Mr. Alfons                        1.0  NaN
del Carlo, Mr. Sebastiano                        1.0  NaN
van Billiard, Mr. Austin Blyler                  1.0  NaN
van Melkebeke, Mr. Philemon                      1.0  NaN

[891 rows x 2 columns]

C:\Users\is2js\Anaconda3\lib\site-packages\matplotlib\tight_layout.py:181: UserWarning: Tight layout not applied. The bottom and top margins cannot be made large enough to accommodate all axes decorations. 
  warnings.warn('Tight layout not applied. '

survived    0    1
pclass            
1          80  136
2          97   87
3         372  119

survived    0    1
cabin             
A10       1.0  NaN
A14       1.0  NaN
A16       NaN  1.0
A19       1.0  NaN
A20       NaN  1.0
A23       NaN  1.0
A24       1.0  NaN
A26       NaN  1.0
A31       NaN  1.0
A32       1.0  NaN
A34       NaN  1.0
A36       1.0  NaN
A5        1.0  NaN
A6        NaN  1.0
A7        1.0  NaN
B101      NaN  1.0
B102      1.0  NaN
B18       NaN  2.0
B19       1.0  NaN
B20       NaN  2.0
B22       1.0  1.0
B28       NaN  2.0
B3        NaN  1.0
B30       1.0  NaN
B35       NaN  2.0
B37       1.0  NaN
B38       1.0  NaN
B39       NaN  1.0
B4        NaN  1.0
B41       NaN  1.0
...       ...  ...
E12       NaN  1.0
E121      NaN  2.0
E17       NaN  1.0
E24       NaN  2.0
E25       NaN  2.0
E31       1.0  NaN
E33       NaN  2.0
E34       NaN  1.0
E36       NaN  1.0
E38       1.0  NaN
E40       NaN  1.0
E44       1.0  1.0
E46       1.0  NaN
E49       NaN  1.0
E50       NaN  1.0
E58       1.0  NaN
E63       1.0  NaN
E67       1.0  1.0
E68       NaN  1.0
E77       1.0  NaN
E8        NaN  2.0
F E69     NaN  1.0
F G63     1.0  NaN
F G73     2.0  NaN
F2        1.0  2.0
F33       NaN  3.0
F38       1.0  NaN
F4        NaN  2.0
G6        2.0  2.0
T         1.0  NaN

[147 rows x 2 columns]

C:\Users\is2js\Anaconda3\lib\site-packages\matplotlib\tight_layout.py:181: UserWarning: Tight layout not applied. The bottom and top margins cannot be made large enough to accommodate all axes decorations. 
  warnings.warn('Tight layout not applied. '

survived             0    1
ticket                     
110152             NaN  3.0
110413             1.0  2.0
110465             2.0  NaN
110564             NaN  1.0
110813             NaN  1.0
111240             1.0  NaN
111320             1.0  NaN
111361             NaN  2.0
111369             NaN  1.0
111426             NaN  1.0
111427             NaN  1.0
111428             NaN  1.0
112050             1.0  NaN
112052             1.0  NaN
112053             NaN  1.0
112058             1.0  NaN
112059             1.0  NaN
112277             NaN  1.0
112379             1.0  NaN
113028             1.0  NaN
113043             1.0  NaN
113050             1.0  NaN
113051             1.0  NaN
113055             NaN  1.0
113056             1.0  NaN
113059             1.0  NaN
113501             1.0  NaN
113503             1.0  NaN
113505             NaN  2.0
113509             1.0  NaN
...                ...  ...
SOTON/OQ 392082    1.0  NaN
SOTON/OQ 392086    1.0  NaN
SOTON/OQ 392089    NaN  1.0
SOTON/OQ 392090    1.0  NaN
STON/O 2. 3101269  NaN  1.0
STON/O 2. 3101273  1.0  NaN
STON/O 2. 3101274  1.0  NaN
STON/O 2. 3101275  1.0  NaN
STON/O 2. 3101280  1.0  NaN
STON/O 2. 3101285  NaN  1.0
STON/O 2. 3101286  NaN  1.0
STON/O 2. 3101288  NaN  1.0
STON/O 2. 3101289  NaN  1.0
STON/O 2. 3101292  1.0  NaN
STON/O 2. 3101293  1.0  NaN
STON/O 2. 3101294  1.0  NaN
STON/O2. 3101271   1.0  NaN
STON/O2. 3101279   1.0  1.0
STON/O2. 3101282   NaN  1.0
STON/O2. 3101283   NaN  1.0
STON/O2. 3101290   1.0  NaN
SW/PP 751          NaN  1.0
W./C. 14258        NaN  1.0
W./C. 14263        1.0  NaN
W./C. 6607         2.0  NaN
W./C. 6608         4.0  NaN
W./C. 6609         1.0  NaN
W.E.P. 5734        1.0  NaN
W/C 14208          1.0  NaN
WE/P 5735          1.0  1.0

[681 rows x 2 columns]

C:\Users\is2js\Anaconda3\lib\site-packages\matplotlib\tight_layout.py:181: UserWarning: Tight layout not applied. The bottom and top margins cannot be made large enough to accommodate all axes decorations. 
  warnings.warn('Tight layout not applied. '

이변수 탐색2 숫자-숫자형의 산점도행렬 with 종속변수

numerical_feature

array(['age', 'fare', 'parch', 'sibsp'], dtype='<U5')

# list는 (-)연산은 안되도 (+) 연산은 된다.
list(numerical_feature) + ['survived']

['age', 'fare', 'parch', 'sibsp', 'survived']

sns.pairplot(  train[ list(numerical_feature) + ['survived'] ],
              hue = 'survived',
            x_vars = numerical_feature,
            y_vars = numerical_feature)

C:\Users\is2js\Anaconda3\lib\site-packages\statsmodels\nonparametric\kde.py:448: RuntimeWarning: invalid value encountered in greater
  X = X[np.logical_and(X > clip[0], X < clip[1])] # won't work for two columns.
C:\Users\is2js\Anaconda3\lib\site-packages\statsmodels\nonparametric\kde.py:448: RuntimeWarning: invalid value encountered in less
  X = X[np.logical_and(X > clip[0], X < clip[1])] # won't work for two columns.

<seaborn.axisgrid.PairGrid at 0x296230e9828>

함수화( 숫자 - 숫자의 산점도 행렬 : pairplot) with 종속변수(범례) by 숫자형칼럼명리스트

type(numerical_feature)

numpy.ndarray

list().dtype

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-119-0901fc39190d> in <module>
----> 1 list().dtype

AttributeError: 'list' object has no attribute 'dtype'

numerical_feature

array(['age', 'fare'], dtype='<U4')

type(list(numerical_feature)) is list

True

len(numerical_feature)

2

def num_to_numerical(df, numerical_feature, name_d=None):
    
    # 종속변수 없을 때
    if name_d == None:
        sns.pairplot(  df[ list(numerical_feature) ] , # 숫자형칼럼들만 인덱싱
                    x_vars = numerical_feature,
                    y_vars = numerical_feature )
        

        plt.tight_layout()
        
    # 숫자 - 숫자 산점도 행렬 with 종속변수 범례
    else:
        sns.pairplot(  df[ list(numerical_feature) + [name_d]] , # 숫자형칼럼들 + 종속변수 인덱싱
                      hue = name_d,                                                  # 범례(색)으로 종속변수 추가
                    x_vars = numerical_feature, 
                    y_vars = numerical_feature )
        
        plt.tight_layout()

num_to_numerical(train, {'age', 'fare'}, 'survived')

이변수 탐색3 : 범주별 - 숫자형 칼럼(들)의 분포(boxplot) with 종속변수(범례) by 숫자형칼럼명리스트

범주별 숫자칼럼1개의 분포(boxplot)

print(train.groupby('sex')['age'].describe())
plt.figure(figsize=(18,9))
sns.boxplot(x='sex', y='age', data=train.dropna())
plt.title("Sex - { }".format('age'))
plt.show()

        count       mean        std   min   25%   50%   75%   max
sex                                                              
female  261.0  27.915709  14.110146  0.75  18.0  27.0  37.0  63.0
male    453.0  30.726645  14.678201  0.42  21.0  29.0  39.0  80.0

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-277-1a8c01f185d7> in <module>
      2 plt.figure(figsize=(18,9))
      3 sns.boxplot(x='sex', y='age', data=train.dropna())
----> 4 plt.title("Sex - { }".format('age'))
      5 plt.show()

KeyError: ' '

함수화(범주별 숫자칼럼1개의 분포(boxplot)) with 종속변수(범례)

def cate_to_num_box(df, name_c, name_n, name_d=None):
    # 범주별 기술통계량
    if name_d != None:
        print(df.groupby([name_c, name_d])[name_n].describe())
    else :
        print(df.groupby(name_c)[name_n].describe())
    
    # 범주별 boxplot - 범례는 선택
    plt.figure(figsize=(18,9))
    sns.boxplot(x=name_c, y=name_n, data=df.dropna(),
               hue = name_d)
    plt.title("{} to {}".format(name_c, name_n))
    
    plt.tight_layout()
    plt.show()

cate_to_num_box(train, 'sex', 'age')

        count       mean        std   min   25%   50%   75%   max
sex                                                              
female  261.0  27.915709  14.110146  0.75  18.0  27.0  37.0  63.0
male    453.0  30.726645  14.678201  0.42  21.0  29.0  39.0  80.0

type(numerical_feature)

numpy.ndarray

type("age")

str

함수화( 범주별 - 숫자칼럼(들)의 분포(boxplot)) with 종속변수(범례)

def cate_to_num_box(df, name_c, numerical_feature, name_d=None):
    
    # 숫자형 칼럼명 1개 ( 칼럼명 = 문자열 1개 직접 입력)
    if type(numerical_feature) is str:
        # 범주별 기술통계량
        if name_d != None:
            print(df.groupby([name_c, name_d])[numerical_feature].describe())
        else :
            print(df.groupby(name_c)[numerical_feature].describe())

        # 범주별 boxplot - 범례는 선택
        plt.figure(figsize=(18,9))
        sns.boxplot(x=name_c, y=numerical_feature, data=df.dropna(),
                   hue = name_d)
        plt.title("{} to {}".format(name_c, numerical_feature))

        plt.tight_layout()
        plt.show()
    
    # 숫자형 칼럼명 리스트가 들어와서 for문을 돌면서 여러개 그릴 때
    else :
        for col in numerical_feature:
            if name_d != None:
                print(df.groupby([name_c, name_d])[col].describe())
            else :
                print(df.groupby(name_c)[col].describe())


            plt.figure(figsize=(18,9))

            sns.boxplot(x=name_c,                   # 범주칼럼명
                            y=col,                                   # 숫자칼럼명
                            data=df.dropna() ,               # df
                           hue = name_d )                     # 종속칼럼명(선택)
            plt.title("{} to {}".format(name_c, col))

            plt.tight_layout()
            plt.show()

cate_to_num_box(train, 'sex', 'age')

        count       mean        std   min   25%   50%   75%   max
sex                                                              
female  261.0  27.915709  14.110146  0.75  18.0  27.0  37.0  63.0
male    453.0  30.726645  14.678201  0.42  21.0  29.0  39.0  80.0

cate_to_num_box(train, 'sex', 'age', 'pclass')

        count       mean        std   min   25%   50%   75%   max
sex                                                              
female  261.0  27.915709  14.110146  0.75  18.0  27.0  37.0  63.0
male    453.0  30.726645  14.678201  0.42  21.0  29.0  39.0  80.0
               count       mean        std   min     25%   50%    75%   max
sex    pclass                                                              
female 1        85.0  34.611765  13.612052  2.00  23.000  35.0  44.00  63.0
       2        74.0  28.722973  12.872702  2.00  22.250  28.0  36.00  57.0
       3       102.0  21.750000  12.729964  0.75  14.125  21.5  29.75  63.0
male   1       101.0  41.281386  15.139570  0.92  30.000  40.0  51.00  80.0
       2        99.0  30.740707  14.793894  0.67  23.000  30.0  36.75  70.0
       3       253.0  26.507589  12.159514  0.42  20.000  25.0  33.00  74.0

numerical_feature

array(['age', 'fare'], dtype='<U4')

cate_to_num_box(train, 'sex', numerical_feature, 'pclass')

               count       mean        std   min     25%   50%    75%   max
sex    pclass                                                              
female 1        85.0  34.611765  13.612052  2.00  23.000  35.0  44.00  63.0
       2        74.0  28.722973  12.872702  2.00  22.250  28.0  36.00  57.0
       3       102.0  21.750000  12.729964  0.75  14.125  21.5  29.75  63.0
male   1       101.0  41.281386  15.139570  0.92  30.000  40.0  51.00  80.0
       2        99.0  30.740707  14.793894  0.67  23.000  30.0  36.75  70.0
       3       253.0  26.507589  12.159514  0.42  20.000  25.0  33.00  74.0

               count        mean        std      min       25%       50%  \
sex    pclass                                                              
female 1        94.0  106.125798  74.259988  25.9292  57.24480  82.66455   
       2        76.0   21.970121  10.891796  10.5000  13.00000  22.00000   
       3       144.0   16.118810  11.690314   6.7500   7.85420  12.47500   
male   1       122.0   67.226127  77.548021   0.0000  27.72810  41.26250   
       2       108.0   19.741782  14.922235   0.0000  12.33125  13.00000   
       3       347.0   12.661633  11.681696   0.0000   7.75000   7.92500   

                      75%       max  
sex    pclass                        
female 1       134.500000  512.3292  
       2        26.062500   65.0000  
       3        20.221875   69.5500  
male   1        78.459375  512.3292  
       2        26.000000   73.5000  
       3        10.008300   69.5500

이변수 탐색4 : 범주별 숫자형칼럼의 범위별 kde분포 탐색

facet = sns.FacetGrid(train, hue = 'survived', height = 4.5,aspect=4)
facet.map(sns.kdeplot, 'age', shade=True)
facet.set(xlim = (0, train['age'].max())) 
facet.add_legend()

<seaborn.axisgrid.FacetGrid at 0x2241e1602b0>

facet = sns.FacetGrid(train, hue = 'survived', aspect=4)
facet.map(sns.kdeplot, 'age', shade=True)
facet.set(xlim = (0, train['age'].max())) 
facet.add_legend() 
plt.xlim([10,20])

(10, 20)

함수화

# ( dataFrame, '범주형 칼럼명' , '숫자형 칼럼명'  ) or
# ( dataFrame, '범주형 칼럼명' , '숫자형 칼럼명' , xlim = [ a, b ] )
def cate_to_num_kde(df, name_c, name_n, xlim = None ):
    print(df.groupby(name_c)[name_n].describe())
    facet = sns.FacetGrid(df.dropna(), hue = name_c, aspect=4)    #  df 및 범주형 변수 설정 및 가로길이(aspect)
    facet.map(sns.kdeplot, name_n, shade=True)                            # kdeplot 설정 및 숫자형변수 지정
    facet.set(xlim = (df[name_n].min(), df[name_n].max()))   # 숫자형 변수의 최소값부터 최대값까지 x범위 최초 지정
    facet.add_legend() 
    if xlim != None:
        print( '범위지정 : {}'.format(xlim))
        plt.xlim(xlim) # xlim 지정시 그 범위만 출력

cate_to_num_kde(train, 'sex', 'age')

        count       mean        std   min   25%   50%   75%   max
sex                                                              
female  261.0  27.915709  14.110146  0.75  18.0  27.0  37.0  63.0
male    453.0  30.726645  14.678201  0.42  21.0  29.0  39.0  80.0

cate_to_num_kde(train, 'sex', 'age', xlim = [20, 30])

        count       mean        std   min   25%   50%   75%   max
sex                                                              
female  261.0  27.915709  14.110146  0.75  18.0  27.0  37.0  63.0
male    453.0  30.726645  14.678201  0.42  21.0  29.0  39.0  80.0
범위지정 : [20, 30]

삼변수 탐색1 : 범주2(범례)별로보는 범주1(x축)별 --> 연속(종속)변수 의 비율

범주1이 종속변수에 끼는 영향에 대해 범주2(범례)의 영향이 있는지(크로스) 없는지(평행) 판단

# https://kaggle-kr.tistory.com/17?category=821486
sns.catplot('sex', 'survived', hue='pclass', data=train, 
            kind='point', # default kind - factorplot
            height=6, 
            aspect=1.5)

<seaborn.axisgrid.FacetGrid at 0x1dd548ba160>

함수화

def legend_to_cate_to_num(df, name_l, name_c, name_n):
    sns.catplot(name_c, name_n, hue=name_l, data=df, 
                kind='point', # default kind - factorplot
                height=6, 
                aspect=1.5)
    
    plt.tight_layout()
    plt.show()

legend_to_cate_to_num(train, 'sex', 'pclass', 'survived')

삼변수 탐색2 : 범례별(종속변수)별로 보는 범주별(x축) 숫자형(y축-바이올린)의 분포

sns.violinplot("pclass","age", hue="survived", data=train, 
               scale='count', split=True)

plt.legend(loc='upper left', bbox_to_anchor=(1.0, 1.0) ) # 범례 밖으로 빼기
plt.tight_layout()
plt.show() # for문에 들어갈 것 대비

함수화(범례별(종속변수)별로 보는 범주별(x축) 숫자형(y축-바이올린)의 분포)

def legend_to_cate_to_num_violin(df,name_l, name_c, name_n):
    
    sns.violinplot(name_c,name_n, hue=name_l, data=df, 
               scale='count', split=True)

    plt.legend(loc='upper left', bbox_to_anchor=(1.0, 1.0) ) # 범례 밖으로 빼기

    plt.tight_layout()
    plt.show() # for문에 들어갈 것 대비

legend_to_cate_to_num_violin(train, 'survived', 'sex', 'age')

categorical_all(train, 'embarked')

cate_to_categorical(train, 'survived', 'embarked')

survived    0    1
embarked          
C          75   93
Q          47   30
S         427  217

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Ticket	Fare	Cabin	Embarked
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	A/5 21171	7.2500	NaN	S
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	PC 17599	71.2833	C85	C
2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	STON/O2. 3101282	7.9250	NaN	S
3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35.0	1	113803	53.1000	C123	S
4	5	0	3	Allen, Mr. William Henry	male	35.0	0	373450	8.0500	NaN	S

	PassengerId	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked
0	892	3	Kelly, Mr. James	male	34.5	0	0	330911	7.8292	NaN	Q
1	893	3	Wilkes, Mrs. James (Ellen Needs)	female	47.0	1	0	363272	7.0000	NaN	S
2	894	2	Myles, Mr. Thomas Francis	male	62.0	0	0	240276	9.6875	NaN	Q
3	895	3	Wirz, Mr. Albert	male	27.0	0	0	315154	8.6625	NaN	S
4	896	3	Hirvonen, Mrs. Alexander (Helga E Lindqvist)	female	22.0	1	1	3101298	12.2875	NaN	S

	index	0
0	passengerid	0
1	survived	0
2	pclass	0
3	name	0
4	sex	0
5	age	177
6	sibsp	0
7	parch	0
8	ticket	0
9	fare	0
10	cabin	687
11	embarked	2

	columns	count	missing_rate
0	passengerid	0	0.000000
1	survived	0	0.000000
2	pclass	0	0.000000
3	name	0	0.000000
4	sex	0	0.000000
5	age	177	0.198653
6	sibsp	0	0.000000
7	parch	0	0.000000
8	ticket	0	0.000000
9	fare	0	0.000000
10	cabin	687	0.771044
11	embarked	2	0.002245

	passengerid	pclass	name	sex	age	sibSp	parch	ticket	fare	cabin	embarked
14	15	3	Vestrom, Miss. Hulda Amanda Adolfina	female	14.0	0	0	350406	7.8542	NaN	S
18	19	3	Vander Planke, Mrs. Julius (Emelia Maria Vande...	female	31.0	1	0	345763	18.0000	NaN	S
24	25	3	Palsson, Miss. Torborg Danira	female	8.0	3	1	349909	21.0750	NaN	S
38	39	3	Vander Planke, Miss. Augusta Maria	female	18.0	2	0	345764	18.0000	NaN	S
40	41	3	Ahlin, Mrs. Johan (Johanna Persdotter Larsson)	female	40.0	1	0	7546	9.4750	NaN	S

survived	0	1
sex
female	0.147541	0.681287
male	0.852459	0.318713

survived	0	1
sex
female	0.147541	0.681287
male	0.852459	0.318713

📜 제목으로 보기

✏마지막 댓글로

데이터 로딩

데이터 확인

칼럼명 소문자로 일괄변경

방법1 : 딕셔너리 형태로 변경( 일부분만 가능)

리스트 컴프리헨션을 통한 한번에 소문자 처리

범주 <-> 숫자 type 제대로 되었는지 확인

Null 체크

train

test

종속변수 확인 및 종속변수에 대한 다른변수들 관계 추론

종속변수의 범주별 빈도 체크

matplotlib 기본

sns

단변수 탐색1 : 범주형 칼럼의 빈도

범주형 칼럼명만 가져오기 & id와 종속변수 제외시키기

id칼럼과 종속변수칼럼 제외시키기 by set

범주칼럼명 리스트를 이용한 범주칼럼들 한꺼번에 빈도 보기

함수화하기 ( 단변수 범주형 칼럼의 빈도 ) with 범주형 칼럼명 리스트

단변수 탐색2 : 숫자형 칼럼의 분포

숫자형 칼럼명 가져오기 = 전체칼럼 - 범주형칼럼 - (id+종속변수)

숫자형 칼럼의 분포 sns.distplot 한꺼번에 그리기

숫자형 칼럼의 분포2 boxplot

숫자형 칼럼의 분포( hist(dist) + boxplot ) 한번에 그리기

함수화하기 ( 단변수 숫자형 분포 dist+boxplot ) with 숫자형 칼럼명 리스트

이변수 탐색1 : 종속변수(범주)별 - 범주형 변수

방법1: 특정(종속)범주별 범주칼럼 인덱싱하여서 각 범주별 series를 df로 합치기

방법2 : pd.crosstab 으로 그리기

빈도

비율

방법3 : groupby( 범주) [ 종속변수].집계.unstack() - 빈도+평균도 가능 + 가상df로서 apply()로 비율계산가능함

빈도

비율 : groupby( ['범주' ,'종속']) ['종속'] 으로 비율 계산하기

방법4 : sns.countplot( '범주칼럼' , hue= '종속변수' ) -> 비율은 볼 수 없다

범주-종속(범주) 종속변수별 범주의 비율 한번에 그리기

종속-범주형변수 관계 함수정의하기

for+함수(범주-범주)문을 통해 모든 종속칼럼에 대해 모든 범주칼럼의 분포와 비율 그려보기

이변수 탐색2 숫자-숫자형의 산점도행렬 with 종속변수

함수화( 숫자 - 숫자의 산점도 행렬 : pairplot) with 종속변수(범례) by 숫자형칼럼명리스트

이변수 탐색3 : 범주별 - 숫자형 칼럼(들)의 분포(boxplot) with 종속변수(범례) by 숫자형칼럼명리스트

범주별 숫자칼럼1개의 분포(boxplot)

함수화(범주별 숫자칼럼1개의 분포(boxplot)) with 종속변수(범례)

함수화( 범주별 - 숫자칼럼(들)의 분포(boxplot)) with 종속변수(범례)

이변수 탐색4 : 범주별 숫자형칼럼의 범위별 kde분포 탐색

함수화

삼변수 탐색1 : 범주2(범례)별로보는 범주1(x축)별 --> 연속(종속)변수 의 비율

함수화

삼변수 탐색2 : 범례별(종속변수)별로 보는 범주별(x축) 숫자형(y축-바이올린)의 분포

함수화(범례별(종속변수)별로 보는 범주별(x축) 숫자형(y축-바이올린)의 분포)

댓글 끝

방법1: `특정(종속)범주별 범주칼럼 인덱싱`하여서 각 범주별 series를 df로 합치기