playdata
Pandas(0728_day4) - 실습_자동차 연비 분석 (데이터 전처리 및 시각화)
_JAEJAE_
2021. 7. 28. 22:25
자동차 연비 데이터셋¶
데이터 살펴보기¶
- 데이터 적재
In [2]:
import pandas as pd
In [29]:
df = pd.read_csv('./auto-mpg.csv', header=None)
- -- original data loading test--
In [17]:
# The original UCI file is whitespace-delimited. Use a raw string so "\s" is
# passed to the regex engine as-is instead of being an invalid string escape.
df2 = pd.read_csv('./auto-mpg_o.csv', sep=r"\s+", header=None)
df2.head()
In [25]:
# Same whitespace-delimited load through read_table. The raw string keeps the
# regex separator "\s+" from being treated as an invalid string escape.
df3 = pd.read_table('./auto-mpg.data-original.txt', sep=r"\s+", header=None)
df3.head()
Out[25]:
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | |
---|---|---|---|---|---|---|---|---|---|
0 | 18.0 | 8.0 | 307.0 | 130.0 | 3504.0 | 12.0 | 70.0 | 1.0 | chevrolet chevelle malibu |
1 | 15.0 | 8.0 | 350.0 | 165.0 | 3693.0 | 11.5 | 70.0 | 1.0 | buick skylark 320 |
2 | 18.0 | 8.0 | 318.0 | 150.0 | 3436.0 | 11.0 | 70.0 | 1.0 | plymouth satellite |
3 | 16.0 | 8.0 | 304.0 | 150.0 | 3433.0 | 12.0 | 70.0 | 1.0 | amc rebel sst |
4 | 17.0 | 8.0 | 302.0 | 140.0 | 3449.0 | 10.5 | 70.0 | 1.0 | ford torino |
- ---- end -----------
- 열 이름을 지정
In [3]:
# mpg : fuel efficiency
# cylinders : number of cylinders
# displacement : engine displacement
# horsepower : power output
# weight : vehicle weight
# acceleration : acceleration performance
# model year : release year
# origin : country of manufacture 1(USA), 2(EU), 3(JPN)
# name : model name
In [30]:
df.columns = ['mpg','cylinders','displacement','horsepower','weight',
'acceleration','model year','origin','name']
- 데이터프레임 df의 내용을 일부 확인
In [31]:
df.head(2)
Out[31]:
mpg | cylinders | displacement | horsepower | weight | acceleration | model year | origin | name | |
---|---|---|---|---|---|---|---|---|---|
0 | 18.0 | 8 | 307.0 | 130.0 | 3504.0 | 12.0 | 70 | 1 | chevrolet chevelle malibu |
1 | 15.0 | 8 | 350.0 | 165.0 | 3693.0 | 11.5 | 70 | 1 | buick skylark 320 |
In [32]:
df.tail()
Out[32]:
mpg | cylinders | displacement | horsepower | weight | acceleration | model year | origin | name | |
---|---|---|---|---|---|---|---|---|---|
393 | 27.0 | 4 | 140.0 | 86.00 | 2790.0 | 15.6 | 82 | 1 | ford mustang gl |
394 | 44.0 | 4 | 97.0 | 52.00 | 2130.0 | 24.6 | 82 | 2 | vw pickup |
395 | 32.0 | 4 | 135.0 | 84.00 | 2295.0 | 11.6 | 82 | 1 | dodge rampage |
396 | 28.0 | 4 | 120.0 | 79.00 | 2625.0 | 18.6 | 82 | 1 | ford ranger |
397 | 31.0 | 4 | 119.0 | 82.00 | 2720.0 | 19.4 | 82 | 1 | chevy s-10 |
- df의 모양과 크기 확인
In [33]:
df.shape
Out[33]:
(398, 9)
- df의 내용 확인
In [34]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 398 entries, 0 to 397 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 mpg 398 non-null float64 1 cylinders 398 non-null int64 2 displacement 398 non-null float64 3 horsepower 398 non-null object 4 weight 398 non-null float64 5 acceleration 398 non-null float64 6 model year 398 non-null int64 7 origin 398 non-null int64 8 name 398 non-null object dtypes: float64(4), int64(3), object(2) memory usage: 28.1+ KB
- df의 자료형 확인
In [35]:
df.dtypes
Out[35]:
mpg float64 cylinders int64 displacement float64 horsepower object weight float64 acceleration float64 model year int64 origin int64 name object dtype: object
- 시리즈(mpg 열)의 자료형 확인
In [36]:
df.mpg.dtypes
Out[36]:
dtype('float64')
- df의 기술통계 정보 확인
In [37]:
df.describe()
Out[37]:
mpg | cylinders | displacement | weight | acceleration | model year | origin | |
---|---|---|---|---|---|---|---|
count | 398.000000 | 398.000000 | 398.000000 | 398.000000 | 398.000000 | 398.000000 | 398.000000 |
mean | 23.514573 | 5.454774 | 193.425879 | 2970.424623 | 15.568090 | 76.010050 | 1.572864 |
std | 7.815984 | 1.701004 | 104.269838 | 846.841774 | 2.757689 | 3.697627 | 0.802055 |
min | 9.000000 | 3.000000 | 68.000000 | 1613.000000 | 8.000000 | 70.000000 | 1.000000 |
25% | 17.500000 | 4.000000 | 104.250000 | 2223.750000 | 13.825000 | 73.000000 | 1.000000 |
50% | 23.000000 | 4.000000 | 148.500000 | 2803.500000 | 15.500000 | 76.000000 | 1.000000 |
75% | 29.000000 | 8.000000 | 262.000000 | 3608.000000 | 17.175000 | 79.000000 | 2.000000 |
max | 46.600000 | 8.000000 | 455.000000 | 5140.000000 | 24.800000 | 82.000000 | 3.000000 |
In [ ]:
# Use the include='all' option
In [38]:
df.describe(include='all')
Out[38]:
mpg | cylinders | displacement | horsepower | weight | acceleration | model year | origin | name | |
---|---|---|---|---|---|---|---|---|---|
count | 398.000000 | 398.000000 | 398.000000 | 398 | 398.000000 | 398.000000 | 398.000000 | 398.000000 | 398 |
unique | NaN | NaN | NaN | 94 | NaN | NaN | NaN | NaN | 305 |
top | NaN | NaN | NaN | 150.0 | NaN | NaN | NaN | NaN | ford pinto |
freq | NaN | NaN | NaN | 22 | NaN | NaN | NaN | NaN | 6 |
mean | 23.514573 | 5.454774 | 193.425879 | NaN | 2970.424623 | 15.568090 | 76.010050 | 1.572864 | NaN |
std | 7.815984 | 1.701004 | 104.269838 | NaN | 846.841774 | 2.757689 | 3.697627 | 0.802055 | NaN |
min | 9.000000 | 3.000000 | 68.000000 | NaN | 1613.000000 | 8.000000 | 70.000000 | 1.000000 | NaN |
25% | 17.500000 | 4.000000 | 104.250000 | NaN | 2223.750000 | 13.825000 | 73.000000 | 1.000000 | NaN |
50% | 23.000000 | 4.000000 | 148.500000 | NaN | 2803.500000 | 15.500000 | 76.000000 | 1.000000 | NaN |
75% | 29.000000 | 8.000000 | 262.000000 | NaN | 3608.000000 | 17.175000 | 79.000000 | 2.000000 | NaN |
max | 46.600000 | 8.000000 | 455.000000 | NaN | 5140.000000 | 24.800000 | 82.000000 | 3.000000 | NaN |
In [ ]:
# Check the descriptive statistics of the name column
In [39]:
df.name.describe()
Out[39]:
count 398 unique 305 top ford pinto freq 6 Name: name, dtype: object
In [ ]:
# Check the descriptive statistics of the mpg column
In [40]:
df.mpg.describe()
Out[40]:
count 398.000000 mean 23.514573 std 7.815984 min 9.000000 25% 17.500000 50% 23.000000 75% 29.000000 max 46.600000 Name: mpg, dtype: float64
데이터 개수 확인¶
- df의 각 열이 가지고 있는 원소 개수 확인
In [41]:
df.count()
Out[41]:
mpg 398 cylinders 398 displacement 398 horsepower 398 weight 398 acceleration 398 model year 398 origin 398 name 398 dtype: int64
- df.count()가 반환하는 객체 타입 출력
In [42]:
type(df.count())
Out[42]:
pandas.core.series.Series
- df의 특정 열('origin')이 가지고 있는 고유값 확인
In [43]:
df['origin'].value_counts()
Out[43]:
1 249 3 79 2 70 Name: origin, dtype: int64
In [ ]:
# Draw a histogram of one column of df ('origin') - the Series.hist() function
In [ ]:
df['origin'].plot(kind='hist', grid=True)
In [20]:
df['origin'].hist()
Out[20]:
<matplotlib.axes._subplots.AxesSubplot at 0x16106c04970>
통계 함수¶
- 평균값
In [21]:
# Average only the numeric columns: horsepower and name are still object
# (string) columns here, and pandas >= 2.0 raises instead of skipping them.
df.mean(numeric_only=True)
Out[21]:
mpg 23.514573 cylinders 5.454774 displacement 193.425879 weight 2970.424623 acceleration 15.568090 model year 76.010050 origin 1.572864 dtype: float64
- 특정 열('mpg')의 평균값
In [22]:
df['mpg'].mean()
Out[22]:
23.514572864321615
In [23]:
df.mpg.mean()
Out[23]:
23.514572864321615
- 특정 열들('mpg','weight')의 평균값
In [24]:
df[['mpg','weight']].mean()
Out[24]:
mpg 23.514573 weight 2970.424623 dtype: float64
- 중간값
In [25]:
# Median of the numeric columns only; the object (string) columns cannot be
# reduced and make pandas >= 2.0 raise when numeric_only is left at its default.
df.median(numeric_only=True)
Out[25]:
mpg 23.0 cylinders 4.0 displacement 148.5 weight 2803.5 acceleration 15.5 model year 76.0 origin 1.0 dtype: float64
- 특정 열('mpg')의 중간값
In [26]:
df['mpg'].median()
Out[26]:
23.0
- 최대값
In [27]:
# NOTE: horsepower is still an object (string) column at this point, so its
# "max" is the largest string in lexicographic order -- the '?' placeholder --
# not the largest numeric horsepower.
df.max()
Out[27]:
mpg 46.6 cylinders 8 displacement 455 horsepower ? weight 5140 acceleration 24.8 model year 82 origin 3 name vw rabbit custom dtype: object
- 특정 열('mpg')의 최대값
In [28]:
df['mpg'].max()
Out[28]:
46.6
- 최소값
In [29]:
# NOTE: horsepower is still an object (string) column at this point, so its
# "min" is the smallest string in lexicographic order ('100.0'), not the
# smallest numeric horsepower.
df.min()
Out[29]:
mpg 9 cylinders 3 displacement 68 horsepower 100.0 weight 1613 acceleration 8 model year 70 origin 1 name amc ambassador brougham dtype: object
- 특정 열('mpg')의 최소값
In [30]:
df['mpg'].min()
Out[30]:
9.0
- 표준편차
In [31]:
# Standard deviation of the numeric columns only; the object (string) columns
# make pandas >= 2.0 raise when numeric_only is left at its default.
df.std(numeric_only=True)
Out[31]:
mpg 7.815984 cylinders 1.701004 displacement 104.269838 weight 846.841774 acceleration 2.757689 model year 3.697627 origin 0.802055 dtype: float64
- 특정 열('mpg')의 표준편차
In [32]:
df['mpg'].std()
Out[32]:
7.815984312565782
- 상관계수
In [33]:
# Correlate the numeric columns only. Selecting them explicitly works on every
# pandas version, whereas a bare df.corr() raises on the string columns
# (horsepower, name) in pandas >= 2.0.
df.select_dtypes(include='number').corr()
Out[33]:
mpg | cylinders | displacement | weight | acceleration | model year | origin | |
---|---|---|---|---|---|---|---|
mpg | 1.000000 | -0.775396 | -0.804203 | -0.831741 | 0.420289 | 0.579267 | 0.563450 |
cylinders | -0.775396 | 1.000000 | 0.950721 | 0.896017 | -0.505419 | -0.348746 | -0.562543 |
displacement | -0.804203 | 0.950721 | 1.000000 | 0.932824 | -0.543684 | -0.370164 | -0.609409 |
weight | -0.831741 | 0.896017 | 0.932824 | 1.000000 | -0.417457 | -0.306564 | -0.581024 |
acceleration | 0.420289 | -0.505419 | -0.543684 | -0.417457 | 1.000000 | 0.288137 | 0.205873 |
model year | 0.579267 | -0.348746 | -0.370164 | -0.306564 | 0.288137 | 1.000000 | 0.180662 |
origin | 0.563450 | -0.562543 | -0.609409 | -0.581024 | 0.205873 | 0.180662 | 1.000000 |
- 'mpg'와 'weight'의 상관 계수
In [34]:
df[['mpg','weight']].corr()
Out[34]:
mpg | weight | |
---|---|---|
mpg | 1.000000 | -0.831741 |
weight | -0.831741 | 1.000000 |
In [35]:
df.plot(kind="scatter", x='weight', y='mpg')
Out[35]:
<matplotlib.axes._subplots.AxesSubplot at 0x1610738ca30>
데이터 전처리¶
- 데이터 구간 분할
In [48]:
import numpy as np
import pandas as pd

# The CSV has no header row: load it and attach the column names in one chain.
df = pd.read_csv('./auto-mpg.csv', header=None).set_axis(
    ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
     'acceleration', 'model year', 'origin', 'name'],
    axis='columns',
)
In [49]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 398 entries, 0 to 397 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 mpg 398 non-null float64 1 cylinders 398 non-null int64 2 displacement 398 non-null float64 3 horsepower 398 non-null object 4 weight 398 non-null float64 5 acceleration 398 non-null float64 6 model year 398 non-null int64 7 origin 398 non-null int64 8 name 398 non-null object dtypes: float64(4), int64(3), object(2) memory usage: 28.1+ KB
In [ ]:
# Remove the missing-data marker ('?') from the horsepower column and convert it to float
# 1. Replace '?' with np.nan
# 2. Drop the rows with missing data
# 3. Convert the strings to float
In [50]:
# Turn the '?' placeholder into a real missing value. Assign the result back
# instead of calling replace(..., inplace=True) on df['horsepower']: that
# chained in-place form is deprecated and does not update df under Copy-on-Write.
df['horsepower'] = df['horsepower'].replace('?', np.nan)
In [51]:
df.isnull().sum()
Out[51]:
mpg 0 cylinders 0 displacement 0 horsepower 6 weight 0 acceleration 0 model year 0 origin 0 name 0 dtype: int64
In [52]:
df.dropna(subset=['horsepower'], inplace = True)
In [53]:
df.count()
Out[53]:
mpg 392 cylinders 392 displacement 392 horsepower 392 weight 392 acceleration 392 model year 392 origin 392 name 392 dtype: int64
In [54]:
df['horsepower'] = df['horsepower'].astype('float64')
In [55]:
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 392 entries, 0 to 397 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 mpg 392 non-null float64 1 cylinders 392 non-null int64 2 displacement 392 non-null float64 3 horsepower 392 non-null float64 4 weight 392 non-null float64 5 acceleration 392 non-null float64 6 model year 392 non-null int64 7 origin 392 non-null int64 8 name 392 non-null object dtypes: float64(5), int64(3), object(1) memory usage: 30.6+ KB
In [44]:
df.describe()
Out[44]:
mpg | cylinders | displacement | horsepower | weight | acceleration | model year | origin | |
---|---|---|---|---|---|---|---|---|
count | 392.000000 | 392.000000 | 392.000000 | 392.000000 | 392.000000 | 392.000000 | 392.000000 | 392.000000 |
mean | 23.445918 | 5.471939 | 194.411990 | 104.469388 | 2977.584184 | 15.541327 | 75.979592 | 1.576531 |
std | 7.805007 | 1.705783 | 104.644004 | 38.491160 | 849.402560 | 2.758864 | 3.683737 | 0.805518 |
min | 9.000000 | 3.000000 | 68.000000 | 46.000000 | 1613.000000 | 8.000000 | 70.000000 | 1.000000 |
25% | 17.000000 | 4.000000 | 105.000000 | 75.000000 | 2225.250000 | 13.775000 | 73.000000 | 1.000000 |
50% | 22.750000 | 4.000000 | 151.000000 | 93.500000 | 2803.500000 | 15.500000 | 76.000000 | 1.000000 |
75% | 29.000000 | 8.000000 | 275.750000 | 126.000000 | 3614.750000 | 17.025000 | 79.000000 | 2.000000 |
max | 46.600000 | 8.000000 | 455.000000 | 230.000000 | 5140.000000 | 24.800000 | 82.000000 | 3.000000 |
In [ ]:
# Split the 'horsepower' column into three bins labelled '저출력' (low output), '보통출력' (medium output), '고출력' (high output) and store them in a new column -> df['hp_bin']
- pd.cut 옵션¶
- include_lowest = True : 가장 작은 구간의 최솟값(왼쪽 경계)도 포함, 즉 첫 구간이 (a, b] -> [a, b] 로 바뀜
In [56]:
# Bin horsepower into three equal-width intervals, labelled from the lowest
# interval to the highest.
cats = pd.cut(
    x=df['horsepower'],
    bins=3,
    labels=['저출력', '보통출력', '고출력'],
)
cats
Out[56]:
0 보통출력 1 보통출력 2 보통출력 3 보통출력 4 보통출력 ... 393 저출력 394 저출력 395 저출력 396 저출력 397 저출력 Name: horsepower, Length: 392, dtype: category Categories (3, object): ['저출력' < '보통출력' < '고출력']
In [57]:
df['hp_bin'] = cats
In [45]:
# For reference: np.histogram returns the per-bin counts and the bin edges
# when horsepower is split into 3 equal-width bins.
count, bin_divider = np.histogram(df['horsepower'], bins=3)
In [46]:
count
Out[46]:
array([257, 103, 32], dtype=int64)
In [47]:
bin_divider
Out[47]:
array([ 46. , 107.33333333, 168.66666667, 230. ])
In [58]:
df
Out[58]:
mpg | cylinders | displacement | horsepower | weight | acceleration | model year | origin | name | hp_bin | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 18.0 | 8 | 307.0 | 130.0 | 3504.0 | 12.0 | 70 | 1 | chevrolet chevelle malibu | 보통출력 |
1 | 15.0 | 8 | 350.0 | 165.0 | 3693.0 | 11.5 | 70 | 1 | buick skylark 320 | 보통출력 |
2 | 18.0 | 8 | 318.0 | 150.0 | 3436.0 | 11.0 | 70 | 1 | plymouth satellite | 보통출력 |
3 | 16.0 | 8 | 304.0 | 150.0 | 3433.0 | 12.0 | 70 | 1 | amc rebel sst | 보통출력 |
4 | 17.0 | 8 | 302.0 | 140.0 | 3449.0 | 10.5 | 70 | 1 | ford torino | 보통출력 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
393 | 27.0 | 4 | 140.0 | 86.0 | 2790.0 | 15.6 | 82 | 1 | ford mustang gl | 저출력 |
394 | 44.0 | 4 | 97.0 | 52.0 | 2130.0 | 24.6 | 82 | 2 | vw pickup | 저출력 |
395 | 32.0 | 4 | 135.0 | 84.0 | 2295.0 | 11.6 | 82 | 1 | dodge rampage | 저출력 |
396 | 28.0 | 4 | 120.0 | 79.0 | 2625.0 | 18.6 | 82 | 1 | ford ranger | 저출력 |
397 | 31.0 | 4 | 119.0 | 82.0 | 2720.0 | 19.4 | 82 | 1 | chevy s-10 | 저출력 |
392 rows × 10 columns
In [59]:
# Check the frequency of each hp_bin category. Use the Series method: the
# top-level pd.value_counts function is deprecated since pandas 2.1.
df['hp_bin'].value_counts()
Out[59]:
저출력 257 보통출력 103 고출력 32 Name: hp_bin, dtype: int64
- 더미 변수
In [67]:
# One-hot encode the horsepower bins. dtype=int keeps the 0/1 indicator values
# on pandas >= 2.0, where get_dummies otherwise returns booleans.
dummies = pd.get_dummies(df['hp_bin'], dtype=int)
# Drop hp_bin by name rather than by position, so this still works if hp_bin
# is not the last column, then attach the indicator columns.
df_dummies = df.drop(columns='hp_bin').join(dummies)
df_dummies
Out[67]:
mpg | cylinders | displacement | horsepower | weight | acceleration | model year | origin | name | 저출력 | 보통출력 | 고출력 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 18.0 | 8 | 307.0 | 130.0 | 3504.0 | 12.0 | 70 | 1 | chevrolet chevelle malibu | 0 | 1 | 0 |
1 | 15.0 | 8 | 350.0 | 165.0 | 3693.0 | 11.5 | 70 | 1 | buick skylark 320 | 0 | 1 | 0 |
2 | 18.0 | 8 | 318.0 | 150.0 | 3436.0 | 11.0 | 70 | 1 | plymouth satellite | 0 | 1 | 0 |
3 | 16.0 | 8 | 304.0 | 150.0 | 3433.0 | 12.0 | 70 | 1 | amc rebel sst | 0 | 1 | 0 |
4 | 17.0 | 8 | 302.0 | 140.0 | 3449.0 | 10.5 | 70 | 1 | ford torino | 0 | 1 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
393 | 27.0 | 4 | 140.0 | 86.0 | 2790.0 | 15.6 | 82 | 1 | ford mustang gl | 1 | 0 | 0 |
394 | 44.0 | 4 | 97.0 | 52.0 | 2130.0 | 24.6 | 82 | 2 | vw pickup | 1 | 0 | 0 |
395 | 32.0 | 4 | 135.0 | 84.0 | 2295.0 | 11.6 | 82 | 1 | dodge rampage | 1 | 0 | 0 |
396 | 28.0 | 4 | 120.0 | 79.0 | 2625.0 | 18.6 | 82 | 1 | ford ranger | 1 | 0 | 0 |
397 | 31.0 | 4 | 119.0 | 82.0 | 2720.0 | 19.4 | 82 | 1 | chevy s-10 | 1 | 0 | 0 |
392 rows × 12 columns
- 시각화
In [4]:
import pandas as pd

# Reload the headerless CSV and attach the column names in one chain.
df = pd.read_csv('./auto-mpg.csv', header=None).set_axis(
    ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
     'acceleration', 'model year', 'origin', 'name'],
    axis='columns',
)
In [5]:
df["mpg"]
Out[5]:
0 18.0 1 15.0 2 18.0 3 16.0 4 17.0 ... 393 27.0 394 44.0 395 32.0 396 28.0 397 31.0 Name: mpg, Length: 398, dtype: float64
In [6]:
df["mpg"].min(), df["mpg"].max()
Out[6]:
(9.0, 46.6)
In [7]:
df["mpg"].plot(kind="hist", bins=10)
Out[7]:
<AxesSubplot:ylabel='Frequency'>
In [8]:
df["mpg"].plot(kind="hist", bins=20, color="coral", figsize=(8, 4))
Out[8]:
<AxesSubplot:ylabel='Frequency'>
In [9]:
import matplotlib.pyplot as plt
In [10]:
plt.style.available
Out[10]:
['Solarize_Light2', '_classic_test_patch', 'bmh', 'classic', 'dark_background', 'fast', 'fivethirtyeight', 'ggplot', 'grayscale', 'seaborn', 'seaborn-bright', 'seaborn-colorblind', 'seaborn-dark', 'seaborn-dark-palette', 'seaborn-darkgrid', 'seaborn-deep', 'seaborn-muted', 'seaborn-notebook', 'seaborn-paper', 'seaborn-pastel', 'seaborn-poster', 'seaborn-talk', 'seaborn-ticks', 'seaborn-white', 'seaborn-whitegrid', 'tableau-colorblind10']
- 스타일 설정¶
In [11]:
plt.style.use('ggplot')
In [12]:
df["mpg"].plot(kind="hist", bins=20, color="coral", figsize=(8, 4))
Out[12]:
<AxesSubplot:ylabel='Frequency'>
In [13]:
df.plot(kind="scatter", x='weight', y='mpg', c='skyblue', s=100, alpha=0.3)
Out[13]:
<AxesSubplot:xlabel='weight', ylabel='mpg'>
In [14]:
df.head()
Out[14]:
mpg | cylinders | displacement | horsepower | weight | acceleration | model year | origin | name | |
---|---|---|---|---|---|---|---|---|---|
0 | 18.0 | 8 | 307.0 | 130.0 | 3504.0 | 12.0 | 70 | 1 | chevrolet chevelle malibu |
1 | 15.0 | 8 | 350.0 | 165.0 | 3693.0 | 11.5 | 70 | 1 | buick skylark 320 |
2 | 18.0 | 8 | 318.0 | 150.0 | 3436.0 | 11.0 | 70 | 1 | plymouth satellite |
3 | 16.0 | 8 | 304.0 | 150.0 | 3433.0 | 12.0 | 70 | 1 | amc rebel sst |
4 | 17.0 | 8 | 302.0 | 140.0 | 3449.0 | 10.5 | 70 | 1 | ford torino |
In [15]:
df['cylinders']
Out[15]:
0 8 1 8 2 8 3 8 4 8 .. 393 4 394 4 395 4 396 4 397 4 Name: cylinders, Length: 398, dtype: int64
In [16]:
cylinder_size = df.cylinders/df.cylinders.max() * 300
In [17]:
df.plot(kind="scatter", x="weight", y="mpg", c="skyblue", s=cylinder_size, alpha=0.3)
Out[17]:
<AxesSubplot:xlabel='weight', ylabel='mpg'>
- 색이 진할수록 cylinder큰 값¶
In [18]:
df.plot(kind="scatter", x="weight", y="mpg", c=cylinder_size, s=100, alpha=0.3, cmap="binary")
Out[18]:
<AxesSubplot:xlabel='weight', ylabel='mpg'>
In [19]:
# origin : country of manufacture 1(USA), 2(EU), 3(JPN)
In [20]:
df['origin'].value_counts()
Out[20]:
1 249 3 79 2 70 Name: origin, dtype: int64
In [21]:
df.groupby("origin").size()
Out[21]:
origin 1 249 2 70 3 79 dtype: int64
In [22]:
# Number of rows per origin code, kept as a one-column frame named "count".
df_origin = df.groupby("origin").size().to_frame(name="count")
In [23]:
df_origin["count"]
Out[23]:
origin 1 249 2 70 3 79 Name: count, dtype: int64
In [24]:
df_origin["count"].plot(kind="pie")
Out[24]:
<AxesSubplot:ylabel='count'>
In [25]:
df_origin["count"].plot(kind="pie", figsize=(7, 5))
Out[25]:
<AxesSubplot:ylabel='count'>
- startangle : 시작 각, default = 0 -> 동쪽, 각 커질수록 반시계방향으로 돌아감¶
In [26]:
# Replace the numeric origin codes with readable names. The grouped counts are
# ordered 1, 2, 3, which maps to USA, EU, JAPAN.
df_origin.index = ["USA", "EU", "JAPAN"]
df_origin["count"].plot(
    kind="pie",
    figsize=(10, 5),
    startangle=10,  # rotate the first wedge 10 degrees counter-clockwise
    colors=["chocolate", "bisque", "cadetblue"],
    autopct="%1.1f%%",  # print each share as a percentage with one decimal
)
# Title typo fixed: "Mode Origin" -> "Model Origin".
plt.title("Model Origin", size=20)
plt.axis("equal")  # equal aspect ratio so the pie is drawn as a circle
plt.legend(labels=df_origin.index, loc="best")
plt.show()
In [27]:
df['origin'] == 1
Out[27]:
0 True 1 True 2 True 3 True 4 True ... 393 True 394 False 395 True 396 True 397 True Name: origin, Length: 398, dtype: bool
In [28]:
# Box plot of mpg for the rows whose origin code is 1.
df.loc[df['origin'] == 1, 'mpg'].plot.box()
Out[28]:
<AxesSubplot:>
- fig : 도화지 역할¶
In [42]:
fig = plt.figure(figsize=(15, 5))
# Split the canvas into a 1x3 grid
ax1, ax2, ax3 = (fig.add_subplot(1, 3, slot) for slot in (1, 2, 3))
# One mpg box plot per origin code, drawn left to right.
for axis, origin_code in zip((ax1, ax2, ax3), (1, 2, 3)):
    axis.boxplot(x=df.loc[df['origin'] == origin_code, 'mpg'])
plt.show()