import pandas as pd


# Load the comma-separated auto-mpg file. It has no header row, so pandas
# labels the columns with integers until names are assigned explicitly.
df = pd.read_csv('./auto-mpg.csv', header=None)


# Whitespace-delimited variant (presumably the same data - confirm).
# The separator is a regular expression, so it is written as a raw string:
# "\s" inside a plain string literal is an invalid escape sequence and
# triggers a warning on recent Python versions.
df2 = pd.read_csv('./auto-mpg_o.csv', sep=r"\s+", header=None)
df2.head()


# Whitespace-delimited text file, read through read_table with the same
# raw-string regex separator.
df3 = pd.read_table('./auto-mpg.data-original.txt', sep=r"\s+", header=None)
df3.head()


# Column legend:
#   mpg          - fuel efficiency
#   cylinders    - number of cylinders
#   displacement - engine displacement
#   horsepower   - power output
#   weight       - vehicle weight
#   acceleration - acceleration capability
#   model year   - release year
#   origin       - country of manufacture: 1 (USA), 2 (EU), 3 (JPN)
#   name         - model name


# Replace the default integer column labels with descriptive names.
df.columns = [
    'mpg',
    'cylinders',
    'displacement',
    'horsepower',
    'weight',
    'acceleration',
    'model year',
    'origin',
    'name',
]


df.head(2)


df.tail()


df.shape

(398, 9)


df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    float64
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   name          398 non-null    object 
dtypes: float64(4), int64(3), object(2)
memory usage: 28.1+ KB


df.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower       object
weight          float64
acceleration    float64
model year        int64
origin            int64
name             object
dtype: object


df.mpg.dtypes

dtype('float64')


# Descriptive statistics with the default settings.
df.describe()


# Use the include='all' option so every column is summarised.


df.describe(include='all')


# Check the descriptive statistics of the 'name' column.


df.name.describe()

count            398
unique           305
top       ford pinto
freq               6
Name: name, dtype: object


# mpg 열의 기술 통계 정보 확인


df.mpg.describe()

count    398.000000
mean      23.514573
std        7.815984
min        9.000000
25%       17.500000
50%       23.000000
75%       29.000000
max       46.600000
Name: mpg, dtype: float64


df.count()

mpg             398
cylinders       398
displacement    398
horsepower      398
weight          398
acceleration    398
model year      398
origin          398
name            398
dtype: int64


type(df.count())

pandas.core.series.Series


df['origin'].value_counts()

1    249
3     79
2     70
Name: origin, dtype: int64


# Draw a histogram of one column of df ('origin') - Series.hist() function


# Variant 1: the generic plot() interface with kind='hist' and a grid.
df['origin'].plot(kind='hist', grid=True)


# Variant 2: the dedicated Series.hist() method.
df['origin'].hist()

<matplotlib.axes._subplots.AxesSubplot at 0x16106c04970>


# Column means over the numeric columns only. numeric_only=True is passed
# explicitly because 'horsepower' and 'name' are object columns here, and
# pandas >= 2.0 raises on them instead of silently skipping them.
df.mean(numeric_only=True)

mpg               23.514573
cylinders          5.454774
displacement     193.425879
weight          2970.424623
acceleration      15.568090
model year        76.010050
origin             1.572864
dtype: float64


df['mpg'].mean()

23.514572864321615


df.mpg.mean()

23.514572864321615


df[['mpg','weight']].mean()

mpg         23.514573
weight    2970.424623
dtype: float64


# Column medians over the numeric columns only. numeric_only=True is passed
# explicitly because 'horsepower' and 'name' are object columns here, and
# pandas >= 2.0 raises on them instead of silently skipping them.
df.median(numeric_only=True)

mpg               23.0
cylinders          4.0
displacement     148.5
weight          2803.5
acceleration      15.5
model year        76.0
origin             1.0
dtype: float64


df['mpg'].median()

23.0


# Column-wise maximum. NOTE: 'horsepower' and 'name' are object columns at
# this point, so their maximum comes from comparing the raw values as text
# rather than as numbers.
df.max()

mpg                         46.6
cylinders                      8
displacement                 455
horsepower                     ?
weight                      5140
acceleration                24.8
model year                    82
origin                         3
name            vw rabbit custom
dtype: object


df['mpg'].max()

46.6


# Column-wise minimum. NOTE: 'horsepower' and 'name' are object columns at
# this point, so their minimum comes from comparing the raw values as text
# rather than as numbers.
df.min()

mpg                                   9
cylinders                             3
displacement                         68
horsepower                        100.0
weight                             1613
acceleration                          8
model year                           70
origin                                1
name            amc ambassador brougham
dtype: object


df['mpg'].min()

9.0


# Column standard deviations over the numeric columns only. numeric_only=True
# is passed explicitly because 'horsepower' and 'name' are object columns
# here, and pandas >= 2.0 raises on them instead of silently skipping them.
df.std(numeric_only=True)

mpg               7.815984
cylinders         1.701004
displacement    104.269838
weight          846.841774
acceleration      2.757689
model year        3.697627
origin            0.802055
dtype: float64


df['mpg'].std()

7.815984312565782


# Pairwise correlation of the numeric columns. The object columns
# ('horsepower', 'name') are filtered out first: pandas >= 2.0 no longer
# drops them automatically and fails while converting the strings to float.
# select_dtypes is used so the call also works on older pandas versions.
df.select_dtypes(include='number').corr()


# Correlation between fuel efficiency and weight only.
df[['mpg','weight']].corr()


# Scatter plot of the same pair of columns.
df.plot(kind="scatter", x='weight', y='mpg')

<matplotlib.axes._subplots.AxesSubplot at 0x1610738ca30>


import pandas as pd
import numpy as np

# Read the raw CSV again; the file is loaded without a header row.
df = pd.read_csv('./auto-mpg.csv', header=None)

# Assign the column names.
df.columns = [
    'mpg', 'cylinders', 'displacement',
    'horsepower', 'weight', 'acceleration',
    'model year', 'origin', 'name',
]


df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    float64
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   name          398 non-null    object 
dtypes: float64(4), int64(3), object(2)
memory usage: 28.1+ KB


# Remove the missing-data marker ('?') from the horsepower column and convert
# the column to float:
# 1. Replace '?' with np.nan
# 2. Drop the rows with missing data
# 3. Convert the strings to float


# Assign the result back rather than calling replace(..., inplace=True) on the
# selected column: the in-place form operates on an intermediate object and,
# under pandas Copy-on-Write, leaves df itself unchanged.
df['horsepower'] = df['horsepower'].replace('?', np.nan)


df.isnull().sum()

mpg             0
cylinders       0
displacement    0
horsepower      6
weight          0
acceleration    0
model year      0
origin          0
name            0
dtype: int64


# Drop, in place, every row whose 'horsepower' value is missing (NaN).
df.dropna(subset=['horsepower'], inplace = True)


df.count()

mpg             392
cylinders       392
displacement    392
horsepower      392
weight          392
acceleration    392
model year      392
origin          392
name            392
dtype: int64


# Cast 'horsepower' to float64; the column was read from the CSV as object dtype.
df['horsepower'] = df['horsepower'].astype('float64')


df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 392 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           392 non-null    float64
 1   cylinders     392 non-null    int64  
 2   displacement  392 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        392 non-null    float64
 5   acceleration  392 non-null    float64
 6   model year    392 non-null    int64  
 7   origin        392 non-null    int64  
 8   name          392 non-null    object 
dtypes: float64(5), int64(3), object(1)
memory usage: 30.6+ KB


df.describe()


# Split the 'horsepower' column into three intervals labelled '저출력' (low
# output), '보통출력' (medium output) and '고출력' (high output) to build a new
# column -> df['hp_bin']


# An integer bins argument asks pd.cut for that many equal-width intervals;
# the labels are attached in ascending order.
cats = pd.cut(df['horsepower'], 3, labels=['저출력', '보통출력', '고출력'])
cats

0      보통출력
1      보통출력
2      보통출력
3      보통출력
4      보통출력
       ... 
393     저출력
394     저출력
395     저출력
396     저출력
397     저출력
Name: horsepower, Length: 392, dtype: category
Categories (3, object): ['저출력' < '보통출력' < '고출력']


# Store the categorical bin labels as a new 'hp_bin' column.
df['hp_bin'] = cats


# For reference: np.histogram with bins=3 returns the per-bin counts and the
# bin edges for the horsepower values.

count, bin_divider = np.histogram(df['horsepower'], bins=3)


count

array([257, 103,  32], dtype=int64)


bin_divider

array([ 46.        , 107.33333333, 168.66666667, 230.        ])

df


# Check the frequency of each category in the hp_bin column.
# Series.value_counts() replaces the top-level pd.value_counts() helper,
# which is deprecated in recent pandas releases.
df['hp_bin'].value_counts()

저출력     257
보통출력    103
고출력      32
Name: hp_bin, dtype: int64


# One-hot encode the hp_bin categories: one indicator column per label.
dummies = pd.get_dummies(df['hp_bin'])
# Take every column of df except the last one (selected by position;
# presumably hp_bin, the most recently added column - confirm) and join the
# indicator columns on the shared index.
df_dummies = df.iloc[:, :-1].join(dummies)
df_dummies


import pandas as pd
# Start again from the raw CSV, which is read without a header row.
df = pd.read_csv('./auto-mpg.csv', header=None)
# Attach the descriptive column names.
df.columns = [
    'mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
    'acceleration', 'model year', 'origin', 'name',
]


df["mpg"]

0      18.0
1      15.0
2      18.0
3      16.0
4      17.0
       ... 
393    27.0
394    44.0
395    32.0
396    28.0
397    31.0
Name: mpg, Length: 398, dtype: float64


df["mpg"].min(), df["mpg"].max()

(9.0, 46.6)


df["mpg"].plot(kind="hist", bins=10)

<AxesSubplot:ylabel='Frequency'>


df["mpg"].plot(kind="hist", bins=20, color="coral", figsize=(8, 4))

<AxesSubplot:ylabel='Frequency'>


import matplotlib.pyplot as plt


plt.style.available

['Solarize_Light2',
 '_classic_test_patch',
 'bmh',
 'classic',
 'dark_background',
 'fast',
 'fivethirtyeight',
 'ggplot',
 'grayscale',
 'seaborn',
 'seaborn-bright',
 'seaborn-colorblind',
 'seaborn-dark',
 'seaborn-dark-palette',
 'seaborn-darkgrid',
 'seaborn-deep',
 'seaborn-muted',
 'seaborn-notebook',
 'seaborn-paper',
 'seaborn-pastel',
 'seaborn-poster',
 'seaborn-talk',
 'seaborn-ticks',
 'seaborn-white',
 'seaborn-whitegrid',
 'tableau-colorblind10']


plt.style.use('ggplot')


df["mpg"].plot(kind="hist", bins=20, color="coral", figsize=(8, 4))

<AxesSubplot:ylabel='Frequency'>


df.plot(kind="scatter", x='weight', y='mpg', c='skyblue', s=100, alpha=0.3)

<AxesSubplot:xlabel='weight', ylabel='mpg'>


df.head()


df['cylinders']

0      8
1      8
2      8
3      8
4      8
      ..
393    4
394    4
395    4
396    4
397    4
Name: cylinders, Length: 398, dtype: int64


# Scale each cylinder count relative to the largest count, so the maximum
# maps to 300; the result is used as a per-point plot attribute.
cylinder_size = df.cylinders/df.cylinders.max() * 300


# Weight vs. mpg scatter plot where the marker size (s) follows the scaled
# cylinder count of each row.
df.plot(kind="scatter", x="weight", y="mpg", c="skyblue", s=cylinder_size, alpha=0.3)

<AxesSubplot:xlabel='weight', ylabel='mpg'>


# Weight vs. mpg scatter plot where the marker colour (c) follows the scaled
# cylinder count, mapped through the "binary" colormap: darker points
# correspond to larger cylinder values.
df.plot(kind="scatter", x="weight", y="mpg", c=cylinder_size, s=100, alpha=0.3, cmap="binary")

<AxesSubplot:xlabel='weight', ylabel='mpg'>


#origin : 제조국 1(USA), 2(EU), 3(JPN)


df['origin'].value_counts()

1    249
3     79
2     70
Name: origin, dtype: int64


df.groupby("origin").size()

origin
1    249
2     70
3     79
dtype: int64


# Build a one-column frame holding the number of rows per origin code,
# indexed by that code.
df_origin = pd.DataFrame({"count": df.groupby("origin").size()})


df_origin["count"]

origin
1    249
2     70
3     79
Name: count, dtype: int64


df_origin["count"].plot(kind="pie")

<AxesSubplot:ylabel='count'>


df_origin["count"].plot(kind="pie", figsize=(7, 5))

<AxesSubplot:ylabel='count'>


# Relabel the index positionally with readable names; this relies on the
# rows being in origin-code order 1, 2, 3 (USA, EU, JPN).
df_origin.index = ["USA", "EU", "JAPAN"]
df_origin["count"].plot(kind="pie",
                        figsize=(10, 5),
                        startangle=10,  # start 10 degrees counter-clockwise from the default (east)
                        colors=["chocolate", "bisque", "cadetblue"],
                        autopct = "%1.1f%%")  # print each share with one decimal place

# Title typo fixed: the chart shows the origin of each car model
# (previously "Mode Origin").
plt.title("Model Origin", size=20)
plt.axis("equal")  # equal axis scaling keeps the pie circular
plt.legend(labels=df_origin.index, loc="best")
plt.show()


df['origin'] == 1

0       True
1       True
2       True
3       True
4       True
       ...  
393     True
394    False
395     True
396     True
397     True
Name: origin, Length: 398, dtype: bool


df[df['origin'] == 1]['mpg'].plot(kind='box')

<AxesSubplot:>


# The figure acts as the canvas; it is split into a 1x3 grid of axes.
fig = plt.figure(figsize=(15, 5))
ax1, ax2, ax3 = (fig.add_subplot(1, 3, slot) for slot in (1, 2, 3))

# One mpg box plot per origin code, left to right: 1, 2, 3.
for axis, origin_code in zip((ax1, ax2, ax3), (1, 2, 3)):
    axis.boxplot(x=df.loc[df['origin'] == origin_code, 'mpg'])

plt.show()

	0	1	2	3	4	5	6	7	8
0	18.0	8.0	307.0	130.0	3504.0	12.0	70.0	1.0	chevrolet chevelle malibu
1	15.0	8.0	350.0	165.0	3693.0	11.5	70.0	1.0	buick skylark 320
2	18.0	8.0	318.0	150.0	3436.0	11.0	70.0	1.0	plymouth satellite
3	16.0	8.0	304.0	150.0	3433.0	12.0	70.0	1.0	amc rebel sst
4	17.0	8.0	302.0	140.0	3449.0	10.5	70.0	1.0	ford torino

	mpg	cylinders	displacement	horsepower	weight	acceleration	model year	origin	name
393	27.0	4	140.0	86.00	2790.0	15.6	82	1	ford mustang gl
394	44.0	4	97.0	52.00	2130.0	24.6	82	2	vw pickup
395	32.0	4	135.0	84.00	2295.0	11.6	82	1	dodge rampage
396	28.0	4	120.0	79.00	2625.0	18.6	82	1	ford ranger
397	31.0	4	119.0	82.00	2720.0	19.4	82	1	chevy s-10

	mpg	cylinders	displacement	horsepower	weight	acceleration	model year	origin	name
count	398.000000	398.000000	398.000000	398	398.000000	398.000000	398.000000	398.000000	398
unique	NaN	NaN	NaN	94	NaN	NaN	NaN	NaN	305
top	NaN	NaN	NaN	150.0	NaN	NaN	NaN	NaN	ford pinto
freq	NaN	NaN	NaN	22	NaN	NaN	NaN	NaN	6
mean	23.514573	5.454774	193.425879	NaN	2970.424623	15.568090	76.010050	1.572864	NaN
std	7.815984	1.701004	104.269838	NaN	846.841774	2.757689	3.697627	0.802055	NaN
min	9.000000	3.000000	68.000000	NaN	1613.000000	8.000000	70.000000	1.000000	NaN
25%	17.500000	4.000000	104.250000	NaN	2223.750000	13.825000	73.000000	1.000000	NaN
50%	23.000000	4.000000	148.500000	NaN	2803.500000	15.500000	76.000000	1.000000	NaN
75%	29.000000	8.000000	262.000000	NaN	3608.000000	17.175000	79.000000	2.000000	NaN
max	46.600000	8.000000	455.000000	NaN	5140.000000	24.800000	82.000000	3.000000	NaN

	mpg	cylinders	displacement	weight	acceleration	model year	origin
mpg	1.000000	-0.775396	-0.804203	-0.831741	0.420289	0.579267	0.563450
cylinders	-0.775396	1.000000	0.950721	0.896017	-0.505419	-0.348746	-0.562543
displacement	-0.804203	0.950721	1.000000	0.932824	-0.543684	-0.370164	-0.609409
weight	-0.831741	0.896017	0.932824	1.000000	-0.417457	-0.306564	-0.581024
acceleration	0.420289	-0.505419	-0.543684	-0.417457	1.000000	0.288137	0.205873
model year	0.579267	-0.348746	-0.370164	-0.306564	0.288137	1.000000	0.180662
origin	0.563450	-0.562543	-0.609409	-0.581024	0.205873	0.180662	1.000000

	mpg	cylinders	displacement	horsepower	weight	acceleration	model year	origin
count	392.000000	392.000000	392.000000	392.000000	392.000000	392.000000	392.000000	392.000000
mean	23.445918	5.471939	194.411990	104.469388	2977.584184	15.541327	75.979592	1.576531
std	7.805007	1.705783	104.644004	38.491160	849.402560	2.758864	3.683737	0.805518
min	9.000000	3.000000	68.000000	46.000000	1613.000000	8.000000	70.000000	1.000000
25%	17.000000	4.000000	105.000000	75.000000	2225.250000	13.775000	73.000000	1.000000
50%	22.750000	4.000000	151.000000	93.500000	2803.500000	15.500000	76.000000	1.000000
75%	29.000000	8.000000	275.750000	126.000000	3614.750000	17.025000	79.000000	2.000000
max	46.600000	8.000000	455.000000	230.000000	5140.000000	24.800000	82.000000	3.000000

Pandas(0728_day4) - 실습_자동차 연비 분석 (데이터 전처리 및 시각화)

자동차 연비 데이터셋

데이터 살펴보기

데이터 개수 확인

통계 함수

데이터 전처리

- pd.cut 옵션

- 스타일 설정

- 색이 진할수록 cylinder 값이 큼

- startangle : 시작 각도, default = 0 (동쪽 방향), 각도가 커질수록 반시계 방향으로 회전함

- fig : 도화지 역할