playdata
Pandas(0727_day3)
_JAEJAE_
2021. 7. 27. 17:08
In [2]:
import pandas as pd
In [1]:
!type examples\csv_mindex.csv
key1,key2,value1,value2
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16
- Creating a hierarchical index (pass multiple columns to index_col)
In [4]:
parsed = pd.read_csv('examples/csv_mindex.csv', index_col=['key1', 'key2'])
parsed
Out[4]:
| key1 | key2 | value1 | value2 |
|---|---|---|---|
| one | a | 1 | 2 |
|  | b | 3 | 4 |
|  | c | 5 | 6 |
|  | d | 7 | 8 |
| two | a | 9 | 10 |
|  | b | 11 | 12 |
|  | c | 13 | 14 |
|  | d | 15 | 16 |
In [5]:
!type examples\ex3.txt
            A         B         C
aaa -0.264438 -1.026059 -0.619500
bbb  0.927272  0.302904 -0.032399
ccc -0.264273 -0.386314 -0.217601
ddd -0.871858 -0.348382  1.100491
In [6]:
list(open('examples/ex3.txt'))
Out[6]:
[' A B C\n', 'aaa -0.264438 -1.026059 -0.619500\n', 'bbb 0.927272 0.302904 -0.032399\n', 'ccc -0.264273 -0.386314 -0.217601\n', 'ddd -0.871858 -0.348382 1.100491\n']
- For files that are not comma-separated, use read_table (here with a regular-expression separator)
In [7]:
pd.read_table('examples/ex3.txt', sep=r"\s+")
Out[7]:
| | A | B | C |
---|---|---|---|
aaa | -0.264438 | -1.026059 | -0.619500 |
bbb | 0.927272 | 0.302904 | -0.032399 |
ccc | -0.264273 | -0.386314 | -0.217601 |
ddd | -0.871858 | -0.348382 | 1.100491 |
- Skipping unwanted rows
In [8]:
!type examples\ex4.csv
# hey!
a,b,c,d,message
# just wanted to make things more difficult for you
# who reads CSV files with computers, anyway?
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo
In [9]:
pd.read_csv('examples\ex4.csv', skiprows=[0, 2, 3])
Out[9]:
| | a | b | c | d | message |
---|---|---|---|---|---|
0 | 1 | 2 | 3 | 4 | hello |
1 | 5 | 6 | 7 | 8 | world |
2 | 9 | 10 | 11 | 12 | foo |
In [10]:
!type examples\ex5.csv
something,a,b,c,d,message
one,1,2,3,4,NA
two,5,6,,8,world
three,9,10,11,12,foo
- Missing values are filled in as NaN when parsing
In [12]:
result = pd.read_csv('examples/ex5.csv')
result
Out[12]:
| | something | a | b | c | d | message |
---|---|---|---|---|---|---|
0 | one | 1 | 2 | 3.0 | 4 | NaN |
1 | two | 5 | 6 | NaN | 8 | world |
2 | three | 9 | 10 | 11.0 | 12 | foo |
In [13]:
pd.isnull(result)
Out[13]:
| | something | a | b | c | d | message |
---|---|---|---|---|---|---|
0 | False | False | False | False | False | True |
1 | False | False | False | True | False | False |
2 | False | False | False | False | False | False |
In [14]:
result = pd.read_csv('examples/ex5.csv', na_values=['NULL'])
result
Out[14]:
| | something | a | b | c | d | message |
---|---|---|---|---|---|---|
0 | one | 1 | 2 | 3.0 | 4 | NaN |
1 | two | 5 | 6 | NaN | 8 | world |
2 | three | 9 | 10 | 11.0 | 12 | foo |
- Marking chosen values as NaN (pass a dict to na_values)
In [15]:
sentinels = {'message':['foo'], 'something':['two']}
result = pd.read_csv('examples/ex5.csv', na_values=sentinels)
result
Out[15]:
| | something | a | b | c | d | message |
---|---|---|---|---|---|---|
0 | one | 1 | 2 | 3.0 | 4 | NaN |
1 | NaN | 5 | 6 | NaN | 8 | world |
2 | three | 9 | 10 | 11.0 | 12 | NaN |
1. Reading text files in pieces
- Change the pandas display setting so that only 10 rows are printed
In [16]:
pd.options.display.max_rows = 10
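This option is global; if needed it can be reset or applied only temporarily. A minimal sketch using pandas' standard option helpers:

```python
import pandas as pd

# Show at most 10 rows when a frame is displayed (as in the cell above)
pd.options.display.max_rows = 10

# Restore the library default
pd.reset_option('display.max_rows')

# Or change the option only inside a block
with pd.option_context('display.max_rows', 10):
    print(pd.get_option('display.max_rows'))  # 10 inside the block
print(pd.get_option('display.max_rows'))      # back to the default outside
```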
In [17]:
result = pd.read_csv('examples/ex6.csv')
result
Out[17]:
| | one | two | three | four | key |
---|---|---|---|---|---|
0 | 0.467976 | -0.038649 | -0.295344 | -1.824726 | L |
1 | -0.358893 | 1.404453 | 0.704965 | -0.200638 | B |
2 | -0.501840 | 0.659254 | -0.421691 | -0.057688 | G |
3 | 0.204886 | 1.074134 | 1.388361 | -0.982404 | R |
4 | 0.354628 | -0.133116 | 0.283763 | -0.837063 | Q |
... | ... | ... | ... | ... | ... |
9995 | 2.311896 | -0.417070 | -1.409599 | -0.515821 | L |
9996 | -0.479893 | -0.650419 | 0.745152 | -0.646038 | E |
9997 | 0.523331 | 0.787112 | 0.486066 | 1.093156 | K |
9998 | -0.362559 | 0.598894 | -1.843201 | 0.887292 | G |
9999 | -0.096376 | -1.012999 | -0.657431 | -0.573315 | 0 |
10000 rows × 5 columns
- head() vs pd.read_csv(..., nrows=5)
- head() loads the entire file first and then shows only the first 5 rows
- nrows=5 reads just 5 rows from the file in the first place (see the sketch below)
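A small sketch contrasting the two approaches on the same file (the difference is how much of the file is actually parsed):

```python
import pandas as pd

# head(): the whole file is parsed first, then only the first 5 rows are shown
whole = pd.read_csv('examples/ex6.csv')     # all 10,000 rows end up in memory
print(whole.head())

# nrows=5: the parser stops after 5 rows, so only 5 rows are ever read
partial = pd.read_csv('examples/ex6.csv', nrows=5)
print(partial.shape)                        # (5, 5)
```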
In [18]:
pd.read_csv('examples/ex6.csv', nrows=5)
Out[18]:
| | one | two | three | four | key |
---|---|---|---|---|---|
0 | 0.467976 | -0.038649 | -0.295344 | -1.824726 | L |
1 | -0.358893 | 1.404453 | 0.704965 | -0.200638 | B |
2 | -0.501840 | 0.659254 | -0.421691 | -0.057688 | G |
3 | 0.204886 | 1.074134 | 1.388361 | -0.982404 | R |
4 | 0.354628 | -0.133116 | 0.283763 | -0.837063 | Q |
In [40]:
chunker = pd.read_csv('examples/ex6.csv', chunksize = 1000)
chunker
Out[40]:
<pandas.io.parsers.TextFileReader at 0x18c9f1383d0>
- Loop over the 10 chunks and count occurrences of each key value (fill_value=0 so keys missing from a chunk don't turn the running total into NaN)
In [41]:
tot = pd.Series([], dtype='float64')
for piece in chunker:
    tot = tot.add(piece['key'].value_counts(), fill_value=0)
tot = tot.sort_values(ascending=False)
tot
Out[41]:
E 368.0 X 364.0 L 346.0 O 343.0 Q 340.0 ... 5 157.0 2 152.0 0 151.0 9 150.0 1 146.0 Length: 36, dtype: float64
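fill_value=0 matters because Series.add aligns on the index: a key that is missing from one side would otherwise produce NaN in the running total. A tiny sketch of that behavior:

```python
import pandas as pd

a = pd.Series({'E': 3, 'X': 1})
b = pd.Series({'E': 2, 'L': 5})

print(a.add(b))                 # L and X become NaN: no matching label on the other side
print(a.add(b, fill_value=0))   # E 5.0, L 5.0, X 1.0: missing labels are treated as 0
```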
In [35]:
tot.isnull().sum()
Out[35]:
36
2. Writing data out in text format
In [42]:
data = pd.read_csv('examples/ex5.csv')
data
Out[42]:
| | something | a | b | c | d | message |
---|---|---|---|---|---|---|
0 | one | 1 | 2 | 3.0 | 4 | NaN |
1 | two | 5 | 6 | NaN | 8 | world |
2 | three | 9 | 10 | 11.0 | 12 | foo |
In [43]:
data.to_csv('examples/out.csv')
!type examples\out.csv
,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo
In [44]:
import sys
data.to_csv(sys.stdout, sep='|')
|something|a|b|c|d|message
0|one|1|2|3.0|4|
1|two|5|6||8|world
2|three|9|10|11.0|12|foo
- Writing missing values out as NULL
In [45]:
data.to_csv(sys.stdout, na_rep='NULL')
,something,a,b,c,d,message
0,one,1,2,3.0,4,NULL
1,two,5,6,NULL,8,world
2,three,9,10,11.0,12,foo
- Writing only the data (no index, no header)
In [46]:
data.to_csv(sys.stdout, index=False, header=False)
one,1,2,3.0,4,
two,5,6,,8,world
three,9,10,11.0,12,foo
- The columns option controls which columns are written, and in what order
In [47]:
data.to_csv(sys.stdout, index=False, columns=['a', 'b', 'c'])
a,b,c
1,2,3.0
5,6,
9,10,11.0
In [49]:
dates = pd.date_range('1/1/2000', periods=7)
dates
Out[49]:
DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03', '2000-01-04', '2000-01-05', '2000-01-06', '2000-01-07'], dtype='datetime64[ns]', freq='D')
In [51]:
import numpy as np
ts = pd.Series(np.arange(7), index=dates)
ts
Out[51]:
2000-01-01 0 2000-01-02 1 2000-01-03 2 2000-01-04 3 2000-01-05 4 2000-01-06 5 2000-01-07 6 Freq: D, dtype: int32
- If header=False is not passed, a header row is written automatically
In [52]:
ts.to_csv('examples/tseries.csv')
In [54]:
!type examples\tseries.csv
,0
2000-01-01,0
2000-01-02,1
2000-01-03,2
2000-01-04,3
2000-01-05,4
2000-01-06,5
2000-01-07,6
In [55]:
ts.to_csv('examples/tseries.csv', header=False)
In [57]:
!type examples\tseries.csv
2000-01-01,0
2000-01-02,1
2000-01-03,2
2000-01-04,3
2000-01-05,4
2000-01-06,5
2000-01-07,6
- Reading Excel files
In [58]:
xlsx = pd.ExcelFile('examples/ex1.xlsx')
In [61]:
type(xlsx)
Out[61]:
pandas.io.excel._base.ExcelFile
In [60]:
frame = pd.read_excel(xlsx, 'Sheet1')
frame
Out[60]:
| | Unnamed: 0 | a | b | c | d | message |
---|---|---|---|---|---|---|
0 | 0 | 1 | 2 | 3 | 4 | hello |
1 | 1 | 5 | 6 | 7 | 8 | world |
2 | 2 | 9 | 10 | 11 | 12 | foo |
- Writing Excel files
1. Using ExcelWriter
In [63]:
writer = pd.ExcelWriter('examples/ex2.xlsx')
In [65]:
type(writer)
Out[65]:
pandas.io.excel._xlsxwriter.XlsxWriter
In [67]:
frame.to_excel(writer, 'Sheet1')
writer.save()
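In newer pandas versions writer.save() is deprecated (and later removed); using ExcelWriter as a context manager avoids the explicit save call. A sketch of that pattern, assuming an Excel engine such as openpyxl or xlsxwriter is installed:

```python
import pandas as pd

frame = pd.read_excel('examples/ex1.xlsx', 'Sheet1')

# The with-block saves and closes the workbook automatically on exit
with pd.ExcelWriter('examples/ex2.xlsx') as writer:
    frame.to_excel(writer, sheet_name='Sheet1')
```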
2. Passing a file path directly to to_excel
In [70]:
frame.to_excel('examples/ex2.xlsx')
Data Cleaning and Preparation
1. Handling missing data
In [71]:
string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data
Out[71]:
0 aardvark 1 artichoke 2 NaN 3 avocado dtype: object
In [72]:
string_data.isnull()
Out[72]:
0 False 1 False 2 True 3 False dtype: bool
In [73]:
string_data[0] = None
In [74]:
type(None)
Out[74]:
NoneType
In [75]:
string_data
Out[75]:
0 None 1 artichoke 2 NaN 3 avocado dtype: object
In [76]:
string_data.isnull()
Out[76]:
0 True 1 False 2 True 3 False dtype: bool
1.1 Filtering out missing data
In [80]:
from numpy import nan as NA
In [81]:
data = pd.Series([1, NA, 3.5, NA, 7])
data
Out[81]:
0 1.0 1 NaN 2 3.5 3 NaN 4 7.0 dtype: float64
- dropna(): drop NaN values
In [83]:
data.dropna()
Out[83]:
0 1.0 2 3.5 4 7.0 dtype: float64
In [84]:
data.notnull()
Out[84]:
0 True 1 False 2 True 3 False 4 True dtype: bool
- Same result as dropna
In [85]:
data[data.notnull()]
Out[85]:
0 1.0 2 3.5 4 7.0 dtype: float64
In [86]:
data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],
[NA, NA, NA], [NA, 6.5, 3.]])
In [87]:
data
Out[87]:
| | 0 | 1 | 2 |
---|---|---|---|
0 | 1.0 | 6.5 | 3.0 |
1 | 1.0 | NaN | NaN |
2 | NaN | NaN | NaN |
3 | NaN | 6.5 | 3.0 |
- dropna() options (see the sketch after this list)
- axis=0 : drop rows
- axis=1 : drop columns
- how='any' : drop a row/column if it contains any NaN
- how='all' : drop a row/column only if all of its values are NaN
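A compact sketch of those combinations on the same kind of frame used in the next cells (rebuilt here so the snippet runs on its own):

```python
import pandas as pd
from numpy import nan as NA

data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],
                     [NA, NA, NA], [NA, 6.5, 3.]])

data.dropna()                   # rows, how='any' (default): only row 0 survives
data.dropna(how='all')          # rows: drop only row 2, which is all NaN
data.dropna(axis=1, how='any')  # columns: every column has a NaN, so all are dropped
data.dropna(axis=1, how='all')  # columns: none is all NaN, so nothing is dropped
```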
In [90]:
data.dropna(how='all')
Out[90]:
| | 0 | 1 | 2 |
---|---|---|---|
0 | 1.0 | 6.5 | 3.0 |
1 | 1.0 | NaN | NaN |
3 | NaN | 6.5 | 3.0 |
- Adding a column
In [91]:
data[4] = NA
data
Out[91]:
| | 0 | 1 | 2 | 4 |
---|---|---|---|---|
0 | 1.0 | 6.5 | 3.0 | NaN |
1 | 1.0 | NaN | NaN | NaN |
2 | NaN | NaN | NaN | NaN |
3 | NaN | 6.5 | 3.0 | NaN |
- Dropping columns that are entirely NaN
In [92]:
data.dropna(axis=1, how='all')
Out[92]:
| | 0 | 1 | 2 |
---|---|---|---|
0 | 1.0 | 6.5 | 3.0 |
1 | 1.0 | NaN | NaN |
2 | NaN | NaN | NaN |
3 | NaN | 6.5 | 3.0 |
In [118]:
df = pd.DataFrame(np.random.randn(7, 3))
df
Out[118]:
| | 0 | 1 | 2 |
---|---|---|---|
0 | 2.487801 | -0.954640 | -1.382698 |
1 | 0.180910 | -0.613711 | 0.315803 |
2 | -0.326607 | -0.420843 | -0.578552 |
3 | 0.492695 | -0.603105 | 2.190383 |
4 | 0.849041 | 0.148989 | 0.643565 |
5 | 0.809561 | 0.129650 | -0.711411 |
6 | 0.090936 | 2.001899 | -1.270992 |
In [119]:
df.iloc[:4, 1] = NA
df
Out[119]:
| | 0 | 1 | 2 |
---|---|---|---|
0 | 2.487801 | NaN | -1.382698 |
1 | 0.180910 | NaN | 0.315803 |
2 | -0.326607 | NaN | -0.578552 |
3 | 0.492695 | NaN | 2.190383 |
4 | 0.849041 | 0.148989 | 0.643565 |
5 | 0.809561 | 0.129650 | -0.711411 |
6 | 0.090936 | 2.001899 | -1.270992 |
In [120]:
df.iloc[:2, 2] = NA
df
Out[120]:
| | 0 | 1 | 2 |
---|---|---|---|
0 | 2.487801 | NaN | NaN |
1 | 0.180910 | NaN | NaN |
2 | -0.326607 | NaN | -0.578552 |
3 | 0.492695 | NaN | 2.190383 |
4 | 0.849041 | 0.148989 | 0.643565 |
5 | 0.809561 | 0.129650 | -0.711411 |
6 | 0.090936 | 2.001899 | -1.270992 |
In [121]:
df.dropna()
Out[121]:
| | 0 | 1 | 2 |
---|---|---|---|
4 | 0.849041 | 0.148989 | 0.643565 |
5 | 0.809561 | 0.129650 | -0.711411 |
6 | 0.090936 | 2.001899 | -1.270992 |
- thresh=2 : keep only rows that have at least 2 non-NaN values; rows with fewer are dropped (see the note at the end of this subsection)
In [122]:
df.dropna(thresh=2)
Out[122]:
| | 0 | 1 | 2 |
---|---|---|---|
2 | -0.326607 | NaN | -0.578552 |
3 | 0.492695 | NaN | 2.190383 |
4 | 0.849041 | 0.148989 | 0.643565 |
5 | 0.809561 | 0.129650 | -0.711411 |
6 | 0.090936 | 2.001899 | -1.270992 |
In [123]:
df.dropna(thresh=2, how='any')
Out[123]:
| | 0 | 1 | 2 |
---|---|---|---|
2 | -0.326607 | NaN | -0.578552 |
3 | 0.492695 | NaN | 2.190383 |
4 | 0.849041 | 0.148989 | 0.643565 |
5 | 0.809561 | 0.129650 | -0.711411 |
6 | 0.090936 | 2.001899 | -1.270992 |
In [124]:
df.dropna(axis=1, thresh=2)
Out[124]:
| | 0 | 1 | 2 |
---|---|---|---|
0 | 2.487801 | NaN | NaN |
1 | 0.180910 | NaN | NaN |
2 | -0.326607 | NaN | -0.578552 |
3 | 0.492695 | NaN | 2.190383 |
4 | 0.849041 | 0.148989 | 0.643565 |
5 | 0.809561 | 0.129650 | -0.711411 |
6 | 0.090936 | 2.001899 | -1.270992 |
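To make the thresh semantics concrete: it counts non-missing values per row, so dropna(thresh=2) is equivalent to the boolean filter in this small sketch (the frame is rebuilt with the same NaN pattern as above; the random values differ):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(7, 3))
df.iloc[:4, 1] = np.nan
df.iloc[:2, 2] = np.nan

kept = df.dropna(thresh=2)                  # rows with at least 2 non-NaN values
same = df[df.notna().sum(axis=1) >= 2]      # the equivalent boolean filter

print(kept.equals(same))                    # True
```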
1.2 Filling in missing data
In [125]:
df
Out[125]:
| | 0 | 1 | 2 |
---|---|---|---|
0 | 2.487801 | NaN | NaN |
1 | 0.180910 | NaN | NaN |
2 | -0.326607 | NaN | -0.578552 |
3 | 0.492695 | NaN | 2.190383 |
4 | 0.849041 | 0.148989 | 0.643565 |
5 | 0.809561 | 0.129650 | -0.711411 |
6 | 0.090936 | 2.001899 | -1.270992 |
In [126]:
df.fillna(0)
Out[126]:
| | 0 | 1 | 2 |
---|---|---|---|
0 | 2.487801 | 0.000000 | 0.000000 |
1 | 0.180910 | 0.000000 | 0.000000 |
2 | -0.326607 | 0.000000 | -0.578552 |
3 | 0.492695 | 0.000000 | 2.190383 |
4 | 0.849041 | 0.148989 | 0.643565 |
5 | 0.809561 | 0.129650 | -0.711411 |
6 | 0.090936 | 2.001899 | -1.270992 |
In [127]:
df.fillna({1:0.5, 2:0})
Out[127]:
| | 0 | 1 | 2 |
---|---|---|---|
0 | 2.487801 | 0.500000 | 0.000000 |
1 | 0.180910 | 0.500000 | 0.000000 |
2 | -0.326607 | 0.500000 | -0.578552 |
3 | 0.492695 | 0.500000 | 2.190383 |
4 | 0.849041 | 0.148989 | 0.643565 |
5 | 0.809561 | 0.129650 | -0.711411 |
6 | 0.090936 | 2.001899 | -1.270992 |
In [128]:
df = pd.DataFrame(np.random.randn(6, 3))
df
Out[128]:
| | 0 | 1 | 2 |
---|---|---|---|
0 | -0.269502 | 0.495406 | -0.814128 |
1 | 0.071575 | 1.011069 | 0.852014 |
2 | -0.532322 | 1.607886 | -0.077601 |
3 | 1.247522 | 1.986249 | 0.527880 |
4 | 0.394341 | -0.247451 | 1.464885 |
5 | 0.699713 | -0.038816 | -1.095486 |
In [129]:
df.iloc[2:, 1] = NA
df
Out[129]:
| | 0 | 1 | 2 |
---|---|---|---|
0 | -0.269502 | 0.495406 | -0.814128 |
1 | 0.071575 | 1.011069 | 0.852014 |
2 | -0.532322 | NaN | -0.077601 |
3 | 1.247522 | NaN | 0.527880 |
4 | 0.394341 | NaN | 1.464885 |
5 | 0.699713 | NaN | -1.095486 |
In [130]:
df.iloc[4:, 2] = NA
df
Out[130]:
| | 0 | 1 | 2 |
---|---|---|---|
0 | -0.269502 | 0.495406 | -0.814128 |
1 | 0.071575 | 1.011069 | 0.852014 |
2 | -0.532322 | NaN | -0.077601 |
3 | 1.247522 | NaN | 0.527880 |
4 | 0.394341 | NaN | NaN |
5 | 0.699713 | NaN | NaN |
- method='ffill' : propagate the last valid value forward along the row axis (axis=0)
In [131]:
df.fillna(method='ffill')
Out[131]:
| | 0 | 1 | 2 |
---|---|---|---|
0 | -0.269502 | 0.495406 | -0.814128 |
1 | 0.071575 | 1.011069 | 0.852014 |
2 | -0.532322 | 1.011069 | -0.077601 |
3 | 1.247522 | 1.011069 | 0.527880 |
4 | 0.394341 | 1.011069 | 0.527880 |
5 | 0.699713 | 1.011069 | 0.527880 |
In [132]:
df.fillna(method='ffill', axis=1)
Out[132]:
| | 0 | 1 | 2 |
---|---|---|---|
0 | -0.269502 | 0.495406 | -0.814128 |
1 | 0.071575 | 1.011069 | 0.852014 |
2 | -0.532322 | -0.532322 | -0.077601 |
3 | 1.247522 | 1.247522 | 0.527880 |
4 | 0.394341 | 0.394341 | 0.394341 |
5 | 0.699713 | 0.699713 | 0.699713 |
In [133]:
df
Out[133]:
| | 0 | 1 | 2 |
---|---|---|---|
0 | -0.269502 | 0.495406 | -0.814128 |
1 | 0.071575 | 1.011069 | 0.852014 |
2 | -0.532322 | NaN | -0.077601 |
3 | 1.247522 | NaN | 0.527880 |
4 | 0.394341 | NaN | NaN |
5 | 0.699713 | NaN | NaN |
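One related fillna option worth knowing: limit caps how many consecutive missing values are filled. A small sketch on a frame with the same NaN pattern as above:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(6, 3))
df.iloc[2:, 1] = np.nan
df.iloc[4:, 2] = np.nan

# Forward-fill, but fill at most 2 consecutive missing values per column:
# rows 2 and 3 of column 1 get filled, rows 4 and 5 stay NaN
df.fillna(method='ffill', limit=2)
```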
In [134]:
data = pd.Series([1., NA, 3.5, NA, 7])
data
Out[134]:
0 1.0 1 NaN 2 3.5 3 NaN 4 7.0 dtype: float64
- Filling NaN with the mean
In [136]:
data.fillna(data.mean())
Out[136]:
0 1.000000 1 3.833333 2 3.500000 3 3.833333 4 7.000000 dtype: float64
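The same idea extends to a DataFrame: fillna aligns a Series of per-column means on the column labels, so each column is filled with its own mean. A short sketch:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({'a': [1.0, np.nan, 3.0],
                   'b': [np.nan, 2.0, 4.0]})

# Each column is filled with its own mean: a -> 2.0, b -> 3.0
df.fillna(df.mean())
```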
2. Data transformation
2.1 Removing duplicates
In [137]:
data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'],
'k2': [1, 1, 2, 3, 3, 4, 4]})
data
Out[137]:
| | k1 | k2 |
---|---|---|
0 | one | 1 |
1 | two | 1 |
2 | one | 2 |
3 | two | 3 |
4 | one | 3 |
5 | two | 4 |
6 | two | 4 |
In [138]:
data.duplicated()
Out[138]:
0 False 1 False 2 False 3 False 4 False 5 False 6 True dtype: bool
In [139]:
data.drop_duplicates()
Out[139]:
| | k1 | k2 |
---|---|---|
0 | one | 1 |
1 | two | 1 |
2 | one | 2 |
3 | two | 3 |
4 | one | 3 |
5 | two | 4 |
In [140]:
data['v1'] = range(7)
data
Out[140]:
| | k1 | k2 | v1 |
---|---|---|---|
0 | one | 1 | 0 |
1 | two | 1 | 1 |
2 | one | 2 | 2 |
3 | two | 3 | 3 |
4 | one | 3 | 4 |
5 | two | 4 | 5 |
6 | two | 4 | 6 |
- Dropping duplicates based on specific columns
- keep = 'first' | 'last' : keep the first occurrence or the last one
In [142]:
data.drop_duplicates(['k1'])
Out[142]:
| | k1 | k2 | v1 |
---|---|---|---|
0 | one | 1 | 0 |
1 | two | 1 | 1 |
In [143]:
data.drop_duplicates(['k1'], keep='last')
Out[143]:
| | k1 | k2 | v1 |
---|---|---|---|
4 | one | 3 | 4 |
6 | two | 4 | 6 |
In [145]:
data
Out[145]:
| | k1 | k2 | v1 |
---|---|---|---|
0 | one | 1 | 0 |
1 | two | 1 | 1 |
2 | one | 2 | 2 |
3 | two | 3 | 3 |
4 | one | 3 | 4 |
5 | two | 4 | 5 |
6 | two | 4 | 6 |
In [144]:
data.drop_duplicates(['k1', 'k2'])
Out[144]:
| | k1 | k2 | v1 |
---|---|---|---|
0 | one | 1 | 0 |
1 | two | 1 | 1 |
2 | one | 2 | 2 |
3 | two | 3 | 3 |
4 | one | 3 | 4 |
5 | two | 4 | 5 |
In [146]:
data.drop_duplicates(['k1', 'k2'], keep='last')
Out[146]:
| | k1 | k2 | v1 |
---|---|---|---|
0 | one | 1 | 0 |
1 | two | 1 | 1 |
2 | one | 2 | 2 |
3 | two | 3 | 3 |
4 | one | 3 | 4 |
6 | two | 4 | 6 |
2.2 Transforming data using a function or mapping
In [147]:
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon',
'Pastrami', 'corned beef', 'Bacon',
'pastrami', 'honey ham', 'nova lox'],
'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data
Out[147]:
| | food | ounces |
---|---|---|
0 | bacon | 4.0 |
1 | pulled pork | 3.0 |
2 | bacon | 12.0 |
3 | Pastrami | 6.0 |
4 | corned beef | 7.5 |
5 | Bacon | 8.0 |
6 | pastrami | 3.0 |
7 | honey ham | 5.0 |
8 | nova lox | 6.0 |
In [148]:
meat_to_animal = {
'bacon': 'pig',
'pulled pork': 'pig',
'pastrami': 'cow',
'corned beef': 'cow',
'honey ham': 'pig',
'nova lox': 'salmon'
}
In [151]:
lowercased = data['food'].str.lower()
lowercased
Out[151]:
0 bacon 1 pulled pork 2 bacon 3 pastrami 4 corned beef 5 bacon 6 pastrami 7 honey ham 8 nova lox Name: food, dtype: object
- Transforming by passing a mapping (dict) to map
In [153]:
data['animal'] = lowercased.map(meat_to_animal)
data
Out[153]:
| | food | ounces | animal |
---|---|---|---|
0 | bacon | 4.0 | pig |
1 | pulled pork | 3.0 | pig |
2 | bacon | 12.0 | pig |
3 | Pastrami | 6.0 | cow |
4 | corned beef | 7.5 | cow |
5 | Bacon | 8.0 | pig |
6 | pastrami | 3.0 | cow |
7 | honey ham | 5.0 | pig |
8 | nova lox | 6.0 | salmon |
- Transforming by passing a user-defined function to map
In [159]:
def f(x):
return meat_to_animal[x.lower()]
In [160]:
data['food'].map(f)
Out[160]:
0 pig 1 pig 2 pig 3 cow 4 cow 5 pig 6 cow 7 pig 8 salmon Name: food, dtype: object
In [161]:
data['food'].map(lambda x: meat_to_animal[x.lower()])
Out[161]:
0 pig 1 pig 2 pig 3 cow 4 cow 5 pig 6 cow 7 pig 8 salmon Name: food, dtype: object
2.3 Replacing values
In [163]:
data = pd.Series([1., -999., 2., -999., -1000., 3.])
data
Out[163]:
0 1.0 1 -999.0 2 2.0 3 -999.0 4 -1000.0 5 3.0 dtype: float64
In [164]:
data.replace(-999, np.nan)
Out[164]:
0 1.0 1 NaN 2 2.0 3 NaN 4 -1000.0 5 3.0 dtype: float64
In [165]:
data.replace([-999, -1000], np.nan)
Out[165]:
0 1.0 1 NaN 2 2.0 3 NaN 4 NaN 5 3.0 dtype: float64
In [166]:
data.replace([-999, -1000], [np.nan, 0])
Out[166]:
0 1.0 1 NaN 2 2.0 3 NaN 4 0.0 5 3.0 dtype: float64
In [167]:
data.replace({-999: np.nan, -1000:0})
Out[167]:
0 1.0 1 NaN 2 2.0 3 NaN 4 0.0 5 3.0 dtype: float64
2.4 Renaming axis indexes
In [168]:
data = pd.DataFrame(np.arange(12).reshape((3, 4)),
index=['Ohio', 'Colorado', 'New York'],
columns=['one', 'two', 'three', 'four'])
data
Out[168]:
| | one | two | three | four |
---|---|---|---|---|
Ohio | 0 | 1 | 2 | 3 |
Colorado | 4 | 5 | 6 | 7 |
New York | 8 | 9 | 10 | 11 |
In [169]:
data.index
Out[169]:
Index(['Ohio', 'Colorado', 'New York'], dtype='object')
In [170]:
transform = lambda x : x.upper()
In [171]:
data.index.map(transform)
Out[171]:
Index(['OHIO', 'COLORADO', 'NEW YORK'], dtype='object')
In [173]:
transform = lambda x : x[:4].upper()
In [174]:
data.index.map(transform)
Out[174]:
Index(['OHIO', 'COLO', 'NEW '], dtype='object')
In [175]:
data
Out[175]:
| | one | two | three | four |
---|---|---|---|---|
Ohio | 0 | 1 | 2 | 3 |
Colorado | 4 | 5 | 6 | 7 |
New York | 8 | 9 | 10 | 11 |
- Converting both the index and the column names to upper case
- str.upper
In [176]:
data.rename(index = str.upper, columns=str.upper)
Out[176]:
| | ONE | TWO | THREE | FOUR |
---|---|---|---|---|
OHIO | 0 | 1 | 2 | 3 |
COLORADO | 4 | 5 | 6 | 7 |
NEW YORK | 8 | 9 | 10 | 11 |
- Capitalizing only the first letter of each column name
- str.title
In [178]:
data.rename(index = str.upper, columns=str.title)
Out[178]:
| | One | Two | Three | Four |
---|---|---|---|---|
OHIO | 0 | 1 | 2 | 3 |
COLORADO | 4 | 5 | 6 | 7 |
NEW YORK | 8 | 9 | 10 | 11 |
- rename can also replace just a subset of the labels
In [179]:
data.rename(index={'Ohio':'INDIANA'}, inplace=True)
data
Out[179]:
| | one | two | three | four |
---|---|---|---|---|
INDIANA | 0 | 1 | 2 | 3 |
Colorado | 4 | 5 | 6 | 7 |
New York | 8 | 9 | 10 | 11 |
2.5 Discretization and binning
In [180]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
In [181]:
bins = [18, 25, 35, 60, 100]
- Binning continuous data into groups
- With right=False the intervals become closed on the left and open on the right: [a, b) instead of (a, b]
In [183]:
cats = pd.cut(ages, bins)
cats
Out[183]:
[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]] Length: 12 Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]
In [184]:
type(cats)
Out[184]:
pandas.core.arrays.categorical.Categorical
- codes : attribute that labels each value with the integer index (starting at 0) of the bin it falls into
In [185]:
cats.codes
Out[185]:
array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)
In [187]:
cats.categories
Out[187]:
IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]], closed='right', dtype='interval[int64]')
- value_counts : number of observations in each bin
In [188]:
pd.value_counts(cats)
Out[188]:
(18, 25] 5 (25, 35] 3 (35, 60] 3 (60, 100] 1 dtype: int64
In [189]:
pd.cut(ages, bins, right=False)
Out[189]:
[[18, 25), [18, 25), [25, 35), [25, 35), [18, 25), ..., [25, 35), [60, 100), [35, 60), [35, 60), [25, 35)] Length: 12 Categories (4, interval[int64]): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]
- Naming the bins
In [190]:
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
In [191]:
pd.cut(ages, bins, labels=group_names)
Out[191]:
['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult'] Length: 12 Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']
- Passing an integer as the second argument to cut splits the data into that many equal-width bins
- precision=2 : round the bin edges to 2 decimal places
- cut vs qcut (see the comparison sketch below)
- pd.cut : equal-width bins, so the number of observations per bin varies with how the data are distributed
- pd.qcut : equal-count (quantile-based) bins, so every bin holds roughly the same number of observations
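The distinction is easiest to see by binning the same data both ways. A small sketch (the bin edges depend on the random draw):

```python
import numpy as np
import pandas as pd

data = np.random.randn(1000)

equal_width = pd.cut(data, 4)    # 4 bins of equal width  -> uneven counts
equal_count = pd.qcut(data, 4)   # 4 quantile-based bins  -> 250 values each

print(pd.value_counts(equal_width))
print(pd.value_counts(equal_count))
```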
In [194]:
data = np.random.rand(20)
c = pd.cut(data, 4, precision=2)
In [196]:
pd.value_counts(c)
Out[196]:
(0.0033, 0.18] 7 (0.36, 0.54] 6 (0.18, 0.36] 5 (0.54, 0.72] 2 dtype: int64
In [201]:
data = np.random.rand(1000)
cats = pd.qcut(data, 4, precision=2)
cats
Out[201]:
[(0.51, 0.75], (0.75, 1.0], (-0.0088, 0.25], (0.75, 1.0], (0.75, 1.0], ..., (-0.0088, 0.25], (0.25, 0.51], (0.25, 0.51], (0.51, 0.75], (0.75, 1.0]] Length: 1000 Categories (4, interval[float64]): [(-0.0088, 0.25] < (0.25, 0.51] < (0.51, 0.75] < (0.75, 1.0]]
In [202]:
pd.value_counts(cats)
Out[202]:
(-0.0088, 0.25] 250 (0.25, 0.51] 250 (0.51, 0.75] 250 (0.75, 1.0] 250 dtype: int64
- You can also pass your own quantiles (numbers between 0 and 1)
In [205]:
q = pd.qcut(data, [0, 0.1, 0.5, 0.9, 1])
q
Out[205]:
[(0.511, 0.897], (0.511, 0.897], (0.00019000000000000006, 0.0976], (0.897, 1.0], (0.511, 0.897], ..., (0.0976, 0.511], (0.0976, 0.511], (0.0976, 0.511], (0.511, 0.897], (0.897, 1.0]] Length: 1000 Categories (4, interval[float64]): [(0.00019000000000000006, 0.0976] < (0.0976, 0.511] < (0.511, 0.897] < (0.897, 1.0]]
In [206]:
pd.value_counts(q)
Out[206]:
(0.0976, 0.511] 400 (0.511, 0.897] 400 (0.00019000000000000006, 0.0976] 100 (0.897, 1.0] 100 dtype: int64
2.6 Detecting and filtering outliers
In [229]:
data = pd.DataFrame(np.random.randn(1000, 4))
data
Out[229]:
| | 0 | 1 | 2 | 3 |
---|---|---|---|---|
0 | 0.824432 | 0.430941 | -0.561765 | -0.665038 |
1 | 0.066199 | -0.966803 | 0.135983 | 0.190991 |
2 | -0.646779 | -1.910689 | -0.410341 | -0.535656 |
3 | -1.150349 | 0.166510 | -0.728350 | -1.192499 |
4 | -2.309373 | 0.580938 | 0.715147 | 0.686918 |
... | ... | ... | ... | ... |
995 | -0.591075 | -1.861731 | -0.706999 | -1.097863 |
996 | 0.508796 | -1.166741 | -1.985351 | -0.001851 |
997 | -1.169001 | 1.304777 | -0.321725 | -0.231718 |
998 | -0.176866 | 0.768612 | -0.006473 | -1.747993 |
999 | -0.031311 | 0.268804 | 0.542087 | -1.322533 |
1000 rows × 4 columns
- describe(): summary statistics
In [230]:
data.describe()
Out[230]:
| | 0 | 1 | 2 | 3 |
---|---|---|---|---|
count | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 |
mean | -0.055534 | 0.013611 | 0.012116 | -0.042056 |
std | 0.986343 | 0.999321 | 0.989407 | 1.010529 |
min | -3.124958 | -3.100540 | -2.987114 | -2.992869 |
25% | -0.668508 | -0.667180 | -0.681322 | -0.735616 |
50% | -0.031437 | -0.001257 | -0.011110 | -0.058144 |
75% | 0.590862 | 0.697791 | 0.712576 | 0.631394 |
max | 3.272081 | 3.527864 | 3.126377 | 3.401081 |
- np.abs(): absolute value of the data
In [222]:
np.abs(data)
Out[222]:
| | 0 | 1 | 2 | 3 |
---|---|---|---|---|
0 | 0.969496 | 1.037778 | 0.470784 | 0.456338 |
1 | 2.563728 | 0.983210 | 0.911133 | 0.782408 |
2 | 0.640341 | 0.232223 | 0.964221 | 1.285585 |
3 | 0.133974 | 0.477829 | 0.439181 | 0.987761 |
4 | 0.662473 | 1.301215 | 0.150583 | 0.798600 |
... | ... | ... | ... | ... |
995 | 1.223971 | 1.160064 | 0.959102 | 0.177906 |
996 | 2.013945 | 0.636599 | 1.504288 | 2.680148 |
997 | 0.357947 | 0.937710 | 0.748091 | 1.226983 |
998 | 0.824923 | 0.469431 | 1.053722 | 0.788416 |
999 | 0.676036 | 0.660155 | 1.197430 | 0.313380 |
1000 rows × 4 columns
- Select rows where any value exceeds 3 in absolute value
In [223]:
data[(np.abs(data) > 3).any(1)]
Out[223]:
| | 0 | 1 | 2 | 3 |
---|---|---|---|---|
297 | -0.442713 | 3.545297 | 0.752514 | 0.701124 |
336 | -0.560589 | -0.648311 | 3.119265 | -0.761394 |
417 | 0.324457 | -0.236248 | 3.565767 | 0.098592 |
496 | 3.080955 | -0.079587 | 0.896459 | 0.641667 |
608 | 3.049659 | -1.015505 | -1.312415 | -0.026697 |
679 | 1.268774 | 1.247228 | 3.087625 | -0.000767 |
903 | 0.481978 | 1.025002 | 1.205544 | 3.335964 |
- np.sign(data): -1 for negative values, 1 for positive values (0 stays 0)
In [224]:
data[np.abs(data) > 3] = np.sign(data) * 3
In [225]:
data.describe()
Out[225]:
| | 0 | 1 | 2 | 3 |
---|---|---|---|---|
count | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 |
mean | -0.027585 | -0.030212 | 0.009758 | 0.044914 |
std | 1.003980 | 0.978128 | 1.004125 | 0.995781 |
min | -2.994121 | -2.957423 | -2.673288 | -2.811389 |
25% | -0.687172 | -0.700384 | -0.663334 | -0.671602 |
50% | -0.015436 | -0.015811 | 0.004428 | 0.059982 |
75% | 0.642045 | 0.636839 | 0.705851 | 0.739858 |
max | 3.000000 | 3.000000 | 3.000000 | 3.000000 |
- Capping outliers with NumPy's clip
In [231]:
new_data = data.values.clip(-3, 3)
new_data
Out[231]:
array([[ 0.8244318 , 0.43094057, -0.56176532, -0.6650382 ], [ 0.0661987 , -0.96680266, 0.13598302, 0.19099137], [-0.646779 , -1.91068895, -0.41034099, -0.53565601], ..., [-1.1690011 , 1.30477696, -0.32172535, -0.23171762], [-0.17686562, 0.76861185, -0.0064728 , -1.74799294], [-0.03131066, 0.26880385, 0.5420866 , -1.32253256]])
In [232]:
new_data.max()
Out[232]:
3.0
In [233]:
new_data.min()
Out[233]:
-3.0
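Note that data.values.clip(-3, 3) returns a plain NumPy array, so the index and column labels are lost. DataFrames have their own clip method that keeps the labels; a short sketch:

```python
import numpy as np
import pandas as pd

data = pd.DataFrame(np.random.randn(1000, 4))

capped = data.clip(-3, 3)                 # still a DataFrame, labels preserved
print(type(capped))                       # <class 'pandas.core.frame.DataFrame'>
print(capped.abs().max().max() <= 3.0)    # True
```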
2.7 Permutation and random sampling
In [234]:
df = pd.DataFrame(np.arange(20).reshape(5, 4))
In [235]:
df # 5 rows
Out[235]:
| | 0 | 1 | 2 | 3 |
---|---|---|---|---|
0 | 0 | 1 | 2 | 3 |
1 | 4 | 5 | 6 | 7 |
2 | 8 | 9 | 10 | 11 |
3 | 12 | 13 | 14 | 15 |
4 | 16 | 17 | 18 | 19 |
In [237]:
sampler = np.random.permutation(5)
sampler
Out[237]:
array([3, 4, 1, 0, 2])
In [238]:
df.iloc[sampler]
Out[238]:
| | 0 | 1 | 2 | 3 |
---|---|---|---|---|
3 | 12 | 13 | 14 | 15 |
4 | 16 | 17 | 18 | 19 |
1 | 4 | 5 | 6 | 7 |
0 | 0 | 1 | 2 | 3 |
2 | 8 | 9 | 10 | 11 |
In [239]:
df.take(sampler)
Out[239]:
| | 0 | 1 | 2 | 3 |
---|---|---|---|---|
3 | 12 | 13 | 14 | 15 |
4 | 16 | 17 | 18 | 19 |
1 | 4 | 5 | 6 | 7 |
0 | 0 | 1 | 2 | 3 |
2 | 8 | 9 | 10 | 11 |
- sample(n) : randomly sample the specified number of rows
In [240]:
df.sample(n=3)
Out[240]:
| | 0 | 1 | 2 | 3 |
---|---|---|---|---|
1 | 4 | 5 | 6 | 7 |
4 | 16 | 17 | 18 | 19 |
2 | 8 | 9 | 10 | 11 |
In [242]:
choices = pd.Series([5, 7, -1, 6, 4])
choices
Out[242]:
0 5 1 7 2 -1 3 6 4 4 dtype: int64
In [243]:
choices.sample(n=3)
Out[243]:
2 -1 0 5 3 6 dtype: int64
- sample options
- replace=True: sample with replacement, so you can draw more samples than there are rows (default is False)
In [244]:
choices.sample(n=10, replace=True)
Out[244]:
1 7 2 -1 1 7 2 -1 0 5 4 4 4 4 0 5 3 6 0 5 dtype: int64