[정리] pandas 사용하면서
pandas 사용하면서 개인적으로 기억하면 좋은 것들을 정리
Unix timestamp to datetime64
import pandas as pd
# data['date'] holds Unix timestamps (seconds, per unit='s' below); the
# column must be a numeric type for the conversion to work.
data['date']
# Vectorized conversion of the whole column to datetime64.
data['date'] = pd.to_datetime(data['date'], unit='s')
# Alternative: element-wise conversion through Series.apply. There is no
# inplace option, so the result has to be assigned back to the column.
data['date'] = data['date'].apply(pd.to_datetime, unit='s')
DtypeWarning: Columns (...) have mixed types
컬럼이 mixed한 데이터 타입을 가지고 있어서 pandas가 타입을 추론할 수 없을 때 발생
e.g.) abc -> 1nc -> 123 -> 123(같은 컬럼)
==> 모든 컬럼을 스트링으로 읽어들인다.
# Default type inference: this is the read that triggers the DtypeWarning
# when a column contains mixed types.
data1 = pd.read_csv("./data/kickstarter/ks_projects1.csv")
# Read the same file again with every column forced to the generic object
# dtype, so values are kept as strings instead of being inferred.
data1 = pd.read_csv(filepath_or_buffer="./data/kickstarter/ks_projects1.csv", dtype=object)
단, NA에 대한 적절한 처리가 이루어져야 한다.
dtype : Type name or dict of column -> type, default None. Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32}. Use str or object together with suitable na_values settings to preserve and not interpret dtype. If converters are specified, they will be applied INSTEAD of dtype conversion.
drop the column
# Labels of the columns to remove from data1.
dropcolumns = ["Unnamed: 13", "Unnamed: 14", "Unnamed: 15", "Unnamed: 16"]
# Spelling 1: pass the labels positionally and select the column axis.
data1 = data1.drop(dropcolumns, axis=1)
# or
# Spelling 2: the `columns=` keyword does the same thing. Use one spelling or
# the other; errors="ignore" is only here so that running this snippet top to
# bottom does not raise KeyError for columns the first call already removed.
data1 = data1.drop(columns=dropcolumns, errors="ignore")
remove whitespace in column names (stripping)
# Strip leading and trailing whitespace from every column label of data1.
data1.columns = data1.columns.str.strip()
interactive plots with plotly
import plotly.offline as py
import plotly.graph_objs as go
def piechart(temp, title):
    """Display an interactive plotly pie chart of each entry's percentage share.

    Args:
        temp: Object exposing ``.index`` and ``.sum()`` and supporting
            division (presumably a pandas Series of counts -- confirm
            against callers). Its index supplies the slice labels.
        title: Title passed to the chart layout.
    """
    # Each value expressed as a percentage of the total.
    percentages = temp / temp.sum() * 100
    pie = go.Pie(labels=temp.index, values=percentages, hoverinfo='label+percent')
    figure = go.Figure(data=[pie], layout=go.Layout(title=title))
    py.iplot(figure)
def barchart(temp, title):
    """Display an interactive plotly bar chart of each entry's percentage share.

    Args:
        temp: Object exposing ``.index`` and ``.sum()`` and supporting
            division (presumably a pandas Series of counts -- confirm
            against callers). Its index supplies the x-axis labels.
        title: Title passed to the chart layout.
    """
    # Each value expressed as a percentage of the total.
    percentages = temp / temp.sum() * 100
    bars = go.Bar(x=temp.index, y=percentages)
    figure = go.Figure(data=[bars], layout=go.Layout(title=title))
    py.iplot(figure)
find row index by re pattern
# Raw string so the backslash in \d is passed to the regex engine as written
# instead of being treated as a (invalid) Python string escape.
pattern_c = r"\d+"
# Test every value of the "country" column against the pattern (one or more
# digits anywhere in the string).
filter_country = data1["country"].str.contains(pattern_c)
# filter_country -> Series of logical (boolean) values, one per row
convert categorical value(string value) to numeric value
# Replace the textual outcome labels with integers: failed -> 0,
# successful -> 1, and every other listed state -> -1.
data1["state"] = data1["state"].map({
    "failed": 0,
    "successful": 1,
    **dict.fromkeys(("canceled", "suspended", "live", "undefined"), -1),
})
# Another approach: cast the column to the "category" dtype and keep the
# integer code assigned to each category.
data1["currency"] = (
    data1["currency"]
    .astype("category")
    .cat
    .codes
)
댓글남기기