| 标题 | 创建时间 | 发布时间 | 最后一次修改 | 备注 |
| --- | --- | --- | --- | --- |
| Pandas 常见问题速查 | 2020.08.04 | 2020.12.21 | 2021.06.09 | / |
一个 dataframe 重复多次,复制内容 n 遍
n = 3
df_repeated = pd.concat([df1]*n, ignore_index=True)
日期分箱
date_bins = pd.date_range('2000-01-01', freq='3D', periods=6) #3天一个步长,总计6个分箱
pd.cut(df.Date,bins=date_bins)
pandas 去重求差集
# Set difference on (col_1, col_2): stack both frames, then drop every
# duplicated key with keep=False so only rows unique to one frame remain.
df_a = pd.DataFrame(aa)
df_b = pd.DataFrame(bb)
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the drop-in replacement (same default index behaviour as append).
df_a = pd.concat([df_a, df_b])
df_a.drop_duplicates(subset=['col_1','col_2'],keep=False)
特征相关性热力图绘制
feature_df.corr().style.background_gradient(cmap='coolwarm')
pandas 不折叠行与列
pd.set_option('display.max_rows', 500) #最大行数
pd.set_option('display.max_columns', 500) #最大列数
pandas rename 列名
df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
df.rename(columns={"A": "a", "B": "c"})
强制关闭 copy warning
# source: https://zhuanlan.zhihu.com/p/41202576 链接中有其他更优解决方案
pd.set_option('mode.chained_assignment', None)
控制 pandas to csv 输出结果顺序
# source: https://stackoverflow.com/questions/11067027/re-ordering-columns-in-pandas-dataframe-based-on-column-name
# 直接排序法
df = df.sort_index(axis=1)
# 重写col顺序
col = ['xx','yy','zz']
df[col]
# 调整单个col顺序
col.remove('xx')
col.insert(-1,'xx')
批量读取文件夹下的文件
# source: https://stackoverflow.com/questions/20906474/import-multiple-csv-files-into-pandas-and-concatenate-into-one-dataframe
# 常规简洁
import glob,os
import pandas as pd
df = pd.concat(map(pd.read_csv, glob.glob(os.path.join('', "my_files*.csv"))))
# dask 包
# dask 介绍: https://www.datalearner.com/blog/1051585624649623
import dask.dataframe as dd
df = dd.read_csv('data*.csv')
df.compute()
pandas groupby value_counts 统计出现最多的组合
# source: https://stackoverflow.com/questions/15222754/groupby-pandas-dataframe-and-select-most-common-value
source.groupby(['Country','City'])['Short name'].agg(pd.Series.mode)
# agg 一次计算多个对象
data.groupby('month', as_index=False).agg({"duration": "sum"})
# 行转列
data.set_index(['xx','yy']).unstack()
## multiindex become columns
## source: https://stackoverflow.com/questions/14507794/pandas-how-to-flatten-a-hierarchical-index-in-columns
pd.DataFrame(df.to_records()) # multiindex become columns and new index is integers only
将 es 的搜索结果(py接口)转化为dataframe
# source: https://medium.com/@patni.sonal/elasticsearch-convert-your-index-data-into-pandas-dataframe-30c44ada2407
# 其他方案: https://stackoverflow.com/questions/25186148/creating-dataframe-from-elasticsearch-results
from elasticsearch import Elasticsearch
from pandasticsearch import Select

# Connect with a generous timeout; an empty body returns the default hit page.
es = Elasticsearch(['http://localhost:9200'], timeout=600)
# Fixed: the index name was wrapped in curly "smart" quotes, which is a SyntaxError.
documents = es.search(index='myindex', body={})
# Core step: flatten the raw ES response dict into a DataFrame.
pandas_df = Select.from_dict(documents).to_pandas()
python “分页” 提取es的大量数据
# refer:
# https://gist.github.com/drorata/146ce50807d16fd4a6aa#gistcomment-2400782
# https://www.cnblogs.com/WessonStar/p/8360659.html
def scroll(index, doc_type, query_body, page_size=100, debug=False, scroll='2m'):
    """Page through a large Elasticsearch result set via the scroll API.

    Yields one tuple per scroll page: (total_pages, page_counter, page_items, page),
    where `page` is the raw response dict for that page.

    Args:
        index: index name (wildcards allowed).
        doc_type: document type (deprecated in recent ES versions — confirm server version).
        query_body: query-DSL dict passed to ``es.search``.
        page_size: documents per page.
        debug: when True, print progress to stdout.
        scroll: scroll-context keep-alive, used for both the initial search and
            every follow-up scroll call.
    """
    page = es.search(index=index, doc_type=doc_type, scroll=scroll, size=page_size, body=query_body)
    sid = page['_scroll_id']
    # NOTE(review): on ES >= 7, hits.total is a dict ({'value': n, ...}); this code
    # assumes the older integer form — confirm against the cluster version.
    scroll_size = page['hits']['total']
    total_pages = math.ceil(scroll_size / page_size)
    page_counter = 0
    if debug:
        print('Total items : {}'.format(scroll_size))
        print('Total pages : {}'.format(total_pages))
    # Start scrolling
    while scroll_size > 0:
        # Number of results returned by the last scroll; 0 means we are done.
        scroll_size = len(page['hits']['hits'])
        if scroll_size > 0:
            if debug:
                print('> Scrolling page {} : {} items'.format(page_counter, scroll_size))
            yield total_pages, page_counter, scroll_size, page
        # Fetch the next page. Fixed: the keep-alive was hard-coded to '2m' here,
        # silently ignoring the `scroll` parameter; now it is honoured.
        page = es.scroll(scroll_id=sid, scroll=scroll)
        page_counter += 1
        # Update the scroll ID (it can change between pages).
        sid = page['_scroll_id']
# Example: pull the caseId field of every document under cases_* page by page.
index = 'cases_*'
doc_type = 'detail'
query = { "query": { "match_all": {} }, "_source": ['caseId'] }
page_size = 1000
pages = scroll(index, doc_type, query, page_size=page_size)
for n_pages, page_no, n_items, page_data in pages:
    print('total_pages={}, page_counter={}, page_items={}'.format(n_pages, page_no, n_items))
    # do what you need with page_data
pandas groupby agg 获得最大最小数据
# refer: https://stackoverflow.com/questions/25024797/max-and-min-date-in-pandas-groupby
df.groupby([key_field]).agg({'time_field': [np.min,np.max]})
pandas object to datetime
df['DateTime'] = pd.to_datetime(df['DateTime'])
pandas multiindex to single index
# refer: https://www.thetopsites.net/article/51868061.shtml
df2.set_axis(df2.columns.map('_'.join), axis=1, inplace=False).add_suffix('_MPG')
# https://stackoverflow.com/questions/14507794/pandas-how-to-flatten-a-hierarchical-index-in-columns
df.columns = df.columns.get_level_values(0)
pandas 科学记数法关闭/设置显示精度
pd.options.display.float_format = '{:.2f}'.format
类 tqdm 定制化进度条
# refer: https://clay-atlas.com/blog/2019/11/11/python-chinese-tutorial-tqdm-progress-and-ourself/
from time import sleep

# Hand-rolled tqdm-style progress bar: redraw a single line in place with '\r'.
total = 1000
temp = 0
for n in range(1000):
    temp = temp + 1
    filled = int(temp * 20 / total)  # number of filled cells out of 20
    print('\r' + '[Progress]:[%s%s]%.2f%%;' % (
        '█' * filled, ' ' * (20 - filled), float(temp / total * 100)), end='')
    sleep(0.01)
# TODO: if n is incremented elsewhere, how to show progress without wrapping the loop in tqdm?
计算最近坐标点
# refer: https://stackoverflow.com/questions/62482609/find-nearest-index-in-one-dataframe-to-another
import pandas as pd
import numpy as np

# Reference points (df1) and query points (df2); for every row of df2 we
# attach the coordinates of the nearest df1 point by Euclidean distance.
a = {
    'X': [1, 2, 5, 7, 10, 5, 2, 3, 24, 21],
    'Y': [3, 4, 8, 15, 20, 12, 23, 22, 14, 7],
    'Z': [12, 4, 9, 16, 13, 1, 8, 17, 11, 19]
}
b = {
    'X': [1, 8, 20, 7, 32],
    'Y': [6, 4, 17, 45, 32],
    'Z': [52, 12, 6, 8, 31]
}
df1 = pd.DataFrame(a)
df2 = pd.DataFrame(b)

def closest(row):
    """Return the (X, Y, Z) of the df1 point nearest to `row`."""
    squared = ((df1['X'] - row['X']) ** 2
               + (df1['Y'] - row['Y']) ** 2
               + (df1['Z'] - row['Z']) ** 2)
    idx = np.sqrt(squared).idxmin()  # first minimiser on the default RangeIndex
    return df1['X'][idx], df1['Y'][idx], df1['Z'][idx]

df2['closest'] = df2.apply(closest, axis=1)
print(df2)
pandas 加速
# https://pandas.pydata.org/pandas-docs/stable/user_guide/enhancingperf.html
# https://medium.com/swlh/6-ways-to-significantly-speed-up-pandas-with-a-couple-lines-of-code-part-1-2c2dfb0de230
pandas 读 json dict
# https://stackoverflow.com/questions/56951109/python-data-frame-read-json-and-filter-data-from-dataframe
# https://stackoverflow.com/questions/46391291/how-to-convert-json-data-inside-a-pandas-column-into-new-columns
import json
import pandas as pd

# pd.io.json.json_normalize was deprecated in pandas 1.0 and later removed;
# the public entry point is pd.json_normalize. Each cell of df.acList is
# assumed to hold a JSON string — TODO confirm against the data source.
pd.json_normalize(df.acList.apply(json.loads))
pandas 读取 excel
# refer: https://medium.com/@chiayinchen/%E4%BD%BF%E7%94%A8-python-pandas-%E5%BE%9E%E5%8C%85%E5%90%AB%E5%A4%9A%E5%80%8B%E5%B7%A5%E4%BD%9C%E8%A1%A8%E7%9A%84-excel-%E4%B8%AD%E6%93%B7%E5%8F%96%E6%95%B8%E6%93%9A-f9fcf507e05b
import pandas as pd
data = pd.read_excel('https://storage.googleapis.com/medium_demo/y0s700000.xls', skiprows = 2, sheet_name = None) # "data" are all sheets as a dictionary
xls = pd.ExcelFile('https://storage.googleapis.com/medium_demo/y0s700000.xls')
xls.sheet_names # see all sheet names
data.get('YOUR_SHEET_NAME_HERE') # get a specific sheet to DataFrame
dataframe 导出为图片
# refer: https://stackoverflow.com/questions/35634238/how-to-save-a-pandas-dataframe-table-as-a-png
import dataframe_image as dfi
df_styled = df.style.background_gradient() #adding a gradient based on values in cell
dfi.export(df_styled,"mytable.png")
获取特定条件下的top数据
# https://stackoverflow.com/questions/41330749/pandas-sort-and-head-inside-groupby
df.groupby(pd.TimeGrouper('D')).apply(lambda x: x.nlargest(2, 'value')).reset_index(level=1, drop=1)
合并多个 dataframe
# https://stackoverflow.com/questions/44327999/python-pandas-merge-multiple-dataframes
import pandas as pd
from functools import reduce
df1 = pd.read_table('file1.csv', sep=',')
df2 = pd.read_table('file2.csv', sep=',')
df3 = pd.read_table('file3.csv', sep=',')
# compile the list of dataframes you want to merge
data_frames = [df1, df2, df3]
df_merged = reduce(lambda left,right: pd.merge(left,right,on=['DATE'],
how='outer'), data_frames)
pandas 读取剪贴板的 markdown
# https://stackoverflow.com/questions/65947246/copy-markdown-formatted-tables-from-so-to-pandas-clipboard
pd.read_clipboard(sep='\s*\|\s*').iloc[1:,1:-1]
pandas groupby reset cumsum 重置累加
# https://stackoverflow.com/questions/45964740/python-pandas-cumsum-with-reset-everytime-there-is-a-0
df = pd.DataFrame([[0,1],[1,1],[0,1],[1,0],[1,1],[0,1]],columns = ['a','b'])
a = df != 0
df1 = a.cumsum()-a.cumsum().where(~a).ffill().fillna(0).astype(int)
print (df1)
pandas 分列
# https://stackoverflow.com/questions/14745022/how-to-split-a-dataframe-string-column-into-two-columns
关于作者