标题 创建时间 发布时间 最后一次修改 备注
Pandas 常见问题速查 2020.08.04 2020.12.21 2020.12.21 /

一个 dataframe 重复多次,复制内容 n 遍

n = 3
df_repeated = pd.concat([df1]*n, ignore_index=True)

日期分箱

date_bins = pd.date_range('2000-01-01', freq='3D', periods=6) #3天一个步长,总计6个分箱
pd.cut(df.Date,bins=date_bins)

pandas 去重求差集

df_a = pd.DataFrame(aa)
df_b = pd.DataFrame(bb)
# DataFrame.append() was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported way to stack the two frames.
df_a = pd.concat([df_a, df_b])

# keep=False drops *all* members of every duplicated group, so only rows
# unique to one frame survive — i.e. the symmetric difference on these columns.
df_a.drop_duplicates(subset=['col_1','col_2'],keep=False)

特征相关性热力图绘制

feature_df.corr().style.background_gradient(cmap='coolwarm')

pandas 不折叠行与列

pd.set_option('display.max_rows', 500) #最大行数
pd.set_option('display.max_columns', 500) #最大列数

pandas rename 列名

df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
df.rename(columns={"A": "a", "B": "c"})

强制关闭 copy warning

# source: https://zhuanlan.zhihu.com/p/41202576 链接中有其他更优解决方案
pd.set_option('mode.chained_assignment', None)

控制 pandas to csv 输出结果顺序

# source: https://stackoverflow.com/questions/11067027/re-ordering-columns-in-pandas-dataframe-based-on-column-name

# Option 1: sort columns alphabetically by label.
df = df.sort_index(axis=1)

# Option 2: spell out the desired column order explicitly.
col = ['xx','yy','zz']
df[col]

# Option 3: reposition a single column within the order list.
# NOTE(review): list.insert(-1, 'xx') puts 'xx' *before* the last element,
# not at the end — use col.append('xx') if the goal is "move to the end".
col.remove('xx')
col.insert(-1,'xx')

批量读取文件夹下的文件

# source: https://stackoverflow.com/questions/20906474/import-multiple-csv-files-into-pandas-and-concatenate-into-one-dataframe

# 常规简洁
import glob,os
import pandas as pd
df = pd.concat(map(pd.read_csv, glob.glob(os.path.join('', "my_files*.csv"))))

# dask 包
# dask 介绍: https://www.datalearner.com/blog/1051585624649623
import dask.dataframe as dd
df = dd.read_csv('data*.csv')
df.compute()

pandas groupby value_counts 统计出现最多的组合

# source: https://stackoverflow.com/questions/15222754/groupby-pandas-dataframe-and-select-most-common-value
source.groupby(['Country','City'])['Short name'].agg(pd.Series.mode)

将 es 的搜索结果(py接口)转化为dataframe

# source: https://medium.com/@patni.sonal/elasticsearch-convert-your-index-data-into-pandas-dataframe-30c44ada2407
# 其他方案: https://stackoverflow.com/questions/25186148/creating-dataframe-from-elasticsearch-results

from elasticsearch import Elasticsearch
from pandasticsearch import Select

# Third-party ES client; 600 s timeout for large result sets.
es = Elasticsearch(['http://localhost:9200'],timeout=600)
# `myindex` is a placeholder — set it to the target index name; an empty
# body matches all documents.
documents = es.search(index=myindex,body={})

# Core step: flatten the raw search response dict into a DataFrame.
pandas_df = Select.from_dict(documents).to_pandas()

python “分页” 提取es的大量数据

# refer: 
# https://gist.github.com/drorata/146ce50807d16fd4a6aa#gistcomment-2400782
# https://www.cnblogs.com/WessonStar/p/8360659.html

def scroll(index, doc_type, query_body, page_size=100, debug=False, scroll='2m'):
    """Page through every hit of an ES query using the scroll API.

    Yields (total_pages, page_counter, page_items, page) for each non-empty
    page of results. Relies on a module-level `es` client and `import math`.
    `scroll` is the keep-alive window of the scroll context (e.g. '2m').
    NOTE(review): `doc_type` is only accepted by ES clients < 8 — confirm
    against the cluster version.
    """
    page = es.search(index=index, doc_type=doc_type, scroll=scroll, size=page_size, body=query_body)
    sid = page['_scroll_id']
    total = page['hits']['total']
    # ES >= 7 reports total as {'value': N, 'relation': ...}; older versions
    # return a bare int — accept both.
    scroll_size = total['value'] if isinstance(total, dict) else total
    total_pages = math.ceil(scroll_size / page_size)
    page_counter = 0
    if debug:
        print('Total items : {}'.format(scroll_size))
        print('Total pages : {}'.format(total_pages))
    # Start scrolling
    while scroll_size > 0:
        # Number of hits actually returned in the current page.
        scroll_size = len(page['hits']['hits'])
        if scroll_size > 0:
            if debug:
                print('> Scrolling page {} : {} items'.format(page_counter, scroll_size))
            yield total_pages, page_counter, scroll_size, page
        else:
            # Empty page: done — avoid issuing one more useless scroll request.
            break
        # Fetch the next page, honouring the caller's keep-alive
        # (was hard-coded to '2m', silently ignoring the `scroll` argument).
        page = es.scroll(scroll_id=sid, scroll=scroll)
        page_counter += 1
        # Update the scroll ID — it may change between calls.
        sid = page['_scroll_id']
        

# Example driver for the scroll() generator defined above; `es` must already
# be connected. Fetches only the caseId field of every matching document.
index = 'cases_*'
doc_type = 'detail'
query = { "query": { "match_all": {} }, "_source": ['caseId'] }
page_size = 1000

for total_pages, page_counter, page_items, page_data in scroll(index, doc_type, query, page_size=page_size):
    print('total_pages={}, page_counter={}, page_items={}'.format(total_pages, page_counter, page_items))
    # do what you need with page_data

pandas groupby agg 获得最大最小数据

# refer: https://stackoverflow.com/questions/25024797/max-and-min-date-in-pandas-groupby
df.groupby([key_field]).agg({'time_field': [np.min,np.max]})

pandas object to datetime

df['DateTime'] = pd.to_datetime(df['DateTime'])

pandas multiindex to single index

# refer: https://www.thetopsites.net/article/51868061.shtml
df2.set_axis(df2.columns.map('_'.join), axis=1, inplace=False).add_suffix('_MPG')

pandas 科学记数法关闭/设置显示精度

pd.options.display.float_format = '{:.2f}'.format

类 tqdm 定制化进度条

# refer: https://clay-atlas.com/blog/2019/11/11/python-chinese-tutorial-tqdm-progress-and-ourself/
from time import sleep

# 20-slot text progress bar, redrawn in place on one line via '\r'.
total = 1000
temp = 0

for n in range(1000):
    temp += 1
    filled = int(temp * 20 / total)
    print('\r' + '[Progress]:[%s%s]%.2f%%;' % (
        '█' * filled,
        ' ' * (20 - filled),
        float(temp / total * 100)), end='')
    sleep(0.01)
    
# TODO:如果 n 在其他地方自增,如何不嵌套 tqdm 实现进度条?

计算最近坐标点

# refer: https://stackoverflow.com/questions/62482609/find-nearest-index-in-one-dataframe-to-another
import pandas as pd
import numpy as np

# Reference cloud: each query row in df2 is matched to its nearest point here.
df1 = pd.DataFrame({
   'X': [1, 2, 5, 7, 10, 5, 2, 3, 24, 21],
   'Y': [3, 4, 8, 15, 20, 12, 23, 22, 14, 7],
   'Z': [12, 4, 9, 16, 13, 1, 8, 17, 11, 19],
})
# Query points.
df2 = pd.DataFrame({
   'X': [1, 8, 20, 7, 32],
   'Y': [6, 4, 17, 45, 32],
   'Z': [52, 12, 6, 8, 31],
})

def closest(row):
    """Return the (X, Y, Z) tuple of the df1 point nearest to *row*."""
    dx = df1['X'] - row['X']
    dy = df1['Y'] - row['Y']
    dz = df1['Z'] - row['Z']
    distances = np.sqrt(dx ** 2 + dy ** 2 + dz ** 2)
    # First index of the minimum distance (ties resolve to the earliest row).
    nearest = int(np.argmin(distances.to_numpy()))
    return df1['X'][nearest], df1['Y'][nearest], df1['Z'][nearest]

df2['closest'] = df2.apply(closest, axis=1)
print(df2)

pandas 加速

# https://pandas.pydata.org/pandas-docs/stable/user_guide/enhancingperf.html
# https://medium.com/swlh/6-ways-to-significantly-speed-up-pandas-with-a-couple-lines-of-code-part-1-2c2dfb0de230

pandas 读 json dict

# https://stackoverflow.com/questions/56951109/python-data-frame-read-json-and-filter-data-from-dataframe

pandas 读取 excel

# refer: https://medium.com/@chiayinchen/%E4%BD%BF%E7%94%A8-python-pandas-%E5%BE%9E%E5%8C%85%E5%90%AB%E5%A4%9A%E5%80%8B%E5%B7%A5%E4%BD%9C%E8%A1%A8%E7%9A%84-excel-%E4%B8%AD%E6%93%B7%E5%8F%96%E6%95%B8%E6%93%9A-f9fcf507e05b

import pandas as pd

data = pd.read_excel('https://storage.googleapis.com/medium_demo/y0s700000.xls', skiprows = 2, sheet_name = None) # "data" are all sheets as a dictionary

xls = pd.ExcelFile('https://storage.googleapis.com/medium_demo/y0s700000.xls')
xls.sheet_names # see all sheet names
data.get('YOUR_SHEET_NAME_HERE') # get a specific sheet to DataFrame

关于作者