标题 创建时间 发布时间 最后一次修改 备注
EDA 常见问题速查 2020.09.01 2020.12.21 2021.03.30 /

参考网站

可视化工具

三维坐标二维化

# Seaborn pair plot: inspect 3-D points through 2-D plots of every column pair.
# source: https://stackoverflow.com/questions/52285104/3d-scatterplots-in-python-with-hue-colormap-and-legend
# Expects the coordinate sequences `x`, `y` and `z` to be defined beforehand.
import pandas as pd
import seaborn as sns

# Start from an empty frame and attach one column per axis, in x/y/z order.
df_3d = pd.DataFrame()
for axis_name, axis_values in (('x', x), ('y', y), ('z', z)):
    df_3d[axis_name] = axis_values

sns.pairplot(df_3d)

韦恩图绘制

  • matplotlib-venn

    from matplotlib_venn import venn2
    # Minimal example: two small sets that share only the element 'D'.
    left_set = {'A', 'B', 'C', 'D'}
    right_set = {'D', 'E', 'F'}
    venn2([left_set, right_set])
        
    def plot_venn(col_1,col_2):
        """Print each column's distinct-value count, then draw their Venn diagram.

        Args:
            col_1: Iterable of hashable values forming the first set.
            col_2: Iterable of hashable values forming the second set.
        """
        distinct_values = []
        for column in (col_1, col_2):
            # Deduplicate and report the size before moving to the next column.
            members = set(column)
            print(len(members))
            distinct_values.append(members)
        venn2(distinct_values)
    

seaborn 加 title

# refer https://stackoverflow.com/questions/42406233/how-to-add-title-to-seaborn-boxplot
# The object returned by sns.boxplot exposes set_title, so the title is
# attached through it.
import seaborn as sns
import matplotlib.pyplot as plt

tips = sns.load_dataset("tips")
box_ax = sns.boxplot(x=tips["total_bill"])
box_ax.set_title("LaLaLa")

plt.show()

seaborn 存 png

# refer https://stackoverflow.com/questions/32244753/how-to-save-a-seaborn-plot-into-a-file
# Draw a pair plot of the iris dataset coloured by species and write it to
# output.png. Relies on `sns` (seaborn) having been imported already.
df = sns.load_dataset('iris')
sns_plot = sns.pairplot(df, hue='species', height=2.5)

# Saving goes through the figure object that the pair plot exposes.
pair_fig = sns_plot.figure
pair_fig.savefig("output.png")

绘制3D图

# 3D 二维化,热力图

层次聚类

# refer: https://scikit-learn.org/stable/auto_examples/inspection/plot_permutation_importance_multicollinear.html#handling-multicollinear-features
# Cluster features by Spearman rank correlation: the left panel shows the
# dendrogram, the right panel the correlation heatmap reordered to match the
# dendrogram leaves.
# Expects `data` and `features` to be defined beforehand -- presumably a
# DataFrame and the list of its column names to analyse (verify).
import matplotlib.pyplot as plt
import numpy as np

from scipy.stats import spearmanr
from scipy.cluster import hierarchy
from scipy.spatial.distance import squareform

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8))
corr = spearmanr(data[features]).correlation

# Force exact symmetry and a unit diagonal so that the derived distance
# matrix passes squareform()'s symmetry / zero-diagonal checks.
corr = (corr + corr.T) / 2
np.fill_diagonal(corr, 1)

# ward() interprets a 2-D array as raw observation vectors, so the square
# correlation matrix must not be passed directly. Convert it to distances
# (strongly correlated features are close) and hand over the condensed form.
distance_matrix = 1 - np.abs(corr)
corr_linkage = hierarchy.ward(squareform(distance_matrix))
dendro = hierarchy.dendrogram(
    corr_linkage, labels=features, ax=ax1, leaf_rotation=90
)
dendro_idx = np.arange(0, len(dendro['ivl']))

# Reorder rows and columns by the dendrogram leaf order so that clustered
# features sit next to each other in the heatmap.
ax2.imshow(corr[dendro['leaves'], :][:, dendro['leaves']])
ax2.set_xticks(dendro_idx)
ax2.set_yticks(dendro_idx)
ax2.set_xticklabels(dendro['ivl'], rotation='vertical')
ax2.set_yticklabels(dendro['ivl'])
fig.tight_layout()
plt.show()

# refer: https://seaborn.pydata.org/generated/seaborn.clustermap.html
# Warning: this tends to crash on large datasets.
# Relies on `sns` (seaborn) having been imported already.
iris = sns.load_dataset("iris")
# pop() removes the "species" column from `iris` in place, so it is not passed
# to clustermap -- presumably because it is non-numeric (verify).
species = iris.pop("species")
g = sns.clustermap(iris)

Chord diagram

# Option 1: holoviews with the bokeh backend
# http://holoviews.org/gallery/demos/matplotlib/route_chord.html
# NOTE(review): the link is the matplotlib gallery page while the code below
# selects the bokeh extension -- confirm which page was intended.
import holoviews as hv
from holoviews import opts, dim
from bokeh.sampledata.airport_routes import routes, airports

hv.extension('bokeh')

# Count the routes between each (source, destination) airport pair
stops_per_pair = routes.groupby(['SourceID', 'DestinationID']).Stops.count()
route_counts = stops_per_pair.reset_index()
nodes = hv.Dataset(airports, 'AirportID', 'City')
chord = hv.Chord((route_counts, nodes), ['SourceID', 'DestinationID'], ['Stops'])

# Select the 20 busiest airports: sort sources ascending by their 'Stops'
# count and keep the last 20 index values.
rows_per_source = routes.groupby('SourceID').count()
ranked_sources = rows_per_source.sort_values('Stops')
busiest = list(ranked_sources.iloc[-20:].index.values)
busiest_airports = chord.select(AirportID=busiest, selection_mode='nodes')

chord_style = opts.Chord(
    cmap='Category20',
    edge_color=dim('SourceID').str(),
    height=800,
    labels='City',
    node_color=dim('AirportID').str(),
    width=800,
)
busiest_airports.opts(chord_style)

# Option 2: the `chord` package
# Expects `vis_df` to be defined beforehand; its values become the matrix and
# its index the labels.
from chord import Chord
names = list(vis_df.index)
matrix = vis_df.values.tolist()
diagram = Chord(matrix, names)
diagram.to_html("chord-diagram-chord-library.html")

关于作者