标题 | 创建时间 | 发布时间 | 最后一次修改 | 备注 |
---|---|---|---|---|
EDA 常见问题速查 | 2020.09.01 | 2020.12.21 | 2021.03.30 | / |
参考网站
可视化工具
三维坐标二维化
# Seaborn pair plot: inspect 3-D points as a grid of pairwise 2-D panels.
# source: https://stackoverflow.com/questions/52285104/3d-scatterplots-in-python-with-hue-colormap-and-legend
import pandas as pd
import seaborn as sns

# x, y and z are expected to be defined before this snippet runs; each one
# becomes a column, assigned in this order onto an initially empty frame.
df_3d = pd.DataFrame().assign(x=x, y=y, z=z)
sns.pairplot(df_3d)
韦恩图绘制
- matplotlib-venn
from matplotlib_venn import venn2

# Minimal example: two literal sets that share the single element 'D'.
venn2([{'A', 'B', 'C', 'D'}, {'D', 'E', 'F'}])


def plot_venn(col_1, col_2):
    """Draw a two-circle Venn diagram of the distinct values in two collections.

    Each argument is converted to a set, so duplicates are ignored. The number
    of distinct values in each collection is printed before the diagram is
    drawn with ``venn2``.

    Args:
        col_1: Iterable of hashable values for the first circle.
        col_2: Iterable of hashable values for the second circle.
    """
    set_1 = set(col_1)
    print(len(set_1))
    set_2 = set(col_2)
    print(len(set_2))
    venn2([set_1, set_2])
seaborn 加 title
# Add a title to a seaborn plot: keep the object returned by the plotting
# call and set the title on it.
# refer https://stackoverflow.com/questions/42406233/how-to-add-title-to-seaborn-boxplot
import matplotlib.pyplot as plt
import seaborn as sns

tips = sns.load_dataset("tips")
ax = sns.boxplot(x=tips["total_bill"])
ax.set_title("LaLaLa")
plt.show()
seaborn 存 png
# Save a seaborn grid plot to a PNG file through its underlying figure.
# refer https://stackoverflow.com/questions/32244753/how-to-save-a-seaborn-plot-into-a-file
df = sns.load_dataset('iris')
sns_plot = sns.pairplot(df, hue='species', height=2.5)
# The object returned by pairplot exposes the figure it drew on; save that.
pair_figure = sns_plot.figure
pair_figure.savefig("output.png")
绘制3D图
# 3D 二维化,热力图
层次聚类
# Hierarchical clustering of features on their Spearman rank correlations:
# the dendrogram goes on the left axis and the correlation heatmap, reordered
# to follow the dendrogram leaves, on the right axis.
# refer: https://scikit-learn.org/stable/auto_examples/inspection/plot_permutation_importance_multicollinear.html#handling-multicollinear-features
# Notes on the linkage parameters: https://haojunsui.github.io/2016/07/16/scipy-hac/
import matplotlib.pyplot as plt
import numpy as np
from scipy.cluster import hierarchy
from scipy.spatial.distance import squareform
from scipy.stats import spearmanr

# `data` and `features` are expected to be defined before this snippet runs.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8))
corr = spearmanr(data[features]).correlation

# Remove floating-point asymmetry and force an exact unit diagonal, so the
# derived distance matrix passes squareform()'s validity checks.
corr = (corr + corr.T) / 2
np.fill_diagonal(corr, 1)

# ward() expects a condensed distance matrix; a square 2-D array would be read
# as raw observation vectors. Convert correlation to distance first.
distance_matrix = 1 - np.abs(corr)
corr_linkage = hierarchy.ward(squareform(distance_matrix))
dendro = hierarchy.dendrogram(
    corr_linkage, labels=features, ax=ax1, leaf_rotation=90
)
dendro_idx = np.arange(0, len(dendro['ivl']))

# Reorder rows and columns to follow the dendrogram leaf order.
ax2.imshow(corr[dendro['leaves'], :][:, dendro['leaves']])
ax2.set_xticks(dendro_idx)
ax2.set_yticks(dendro_idx)
ax2.set_xticklabels(dendro['ivl'], rotation='vertical')
ax2.set_yticklabels(dendro['ivl'])
fig.tight_layout()
plt.show()
# Clustered heatmap with seaborn.
# refer: https://seaborn.pydata.org/generated/seaborn.clustermap.html
# Caveat: this tends to crash on large datasets.
iris = sns.load_dataset("iris")
# pop() removes the "species" column from `iris` in place and returns it, so
# the frame passed to clustermap no longer contains that column.
species = iris.pop("species")
g = sns.clustermap(iris)
Chord diagram
# Option 1: HoloViews with the bokeh backend
# http://holoviews.org/gallery/demos/matplotlib/route_chord.html
import holoviews as hv
from holoviews import opts, dim
from bokeh.sampledata.airport_routes import routes, airports

hv.extension('bokeh')

# Count the routes between each (source, destination) airport pair.
route_counts = (
    routes.groupby(['SourceID', 'DestinationID'])
    .Stops.count()
    .reset_index()
)
nodes = hv.Dataset(airports, 'AirportID', 'City')
chord = hv.Chord((route_counts, nodes), ['SourceID', 'DestinationID'], ['Stops'])

# Keep only the 20 busiest source airports (last 20 rows after an ascending sort).
routes_per_source = routes.groupby('SourceID').count().sort_values('Stops')
busiest = list(routes_per_source.iloc[-20:].index.values)
busiest_airports = chord.select(AirportID=busiest, selection_mode='nodes')

# Styling: colour edges by source airport and nodes by airport id, label nodes by city.
chord_style = opts.Chord(
    cmap='Category20',
    edge_color=dim('SourceID').str(),
    height=800,
    labels='City',
    node_color=dim('AirportID').str(),
    width=800,
)
busiest_airports.opts(chord_style)
# Option 2: the `chord` package, exported as a standalone HTML file
from chord import Chord

# vis_df is expected to be defined before this snippet runs: its values are
# used as the matrix and its index as the segment names.
names = vis_df.index.tolist()
matrix = vis_df.values.tolist()
diagram = Chord(matrix, names)
diagram.to_html("chord-diagram-chord-library.html")
matplotlib 在 macOS 下的中文字符处理
# Make matplotlib render Chinese characters by switching the sans-serif font
# family to "Arial Unicode MS".
# https://www.cnblogs.com/cymwill/p/10554916.html
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']
# Alternative: download a font file instead.
# https://stackoverflow.com/questions/35668219/how-to-set-up-a-custom-font-with-custom-path-to-matplotlib-global-font
# NOTE: the leading "!" is an IPython/Jupyter shell escape, so the line below
# only works inside a notebook or IPython session, not in a plain .py script.
# NOTE(review): the downloaded SimHei.ttf presumably still has to be registered
# with matplotlib as described in the link above — confirm.
! wget https://raw.githubusercontent.com/StellarCN/scp_zh/master/fonts/SimHei.ttf