接下来说说R语言作图。这两年生物信息学呈爆炸式发展,R语言由于在多元统计方面的优势,以及比较简单的学习过程,受到很多人的青睐。Hadley的ggplot2风靡一时,好多文章纷纷使用,主要是因为它基于Grammar of Graphics的思想,将画图简单化、符合人们的思维方式。然而Python的作图库matplotlib则完全相反,虽然非常强大,但是学习起来难度很大。给个code对比就可以发现:
import matplotlib.pyplot as plt
import seaborn as sns

# Use the "ticks" axes style with a sans-serif font family.
# NOTE(review): 'font.serif' is set while the family is 'sans-serif';
# 'font.sans-serif' was probably intended -- confirm before changing.
sns.set_style('ticks', {'font.family': 'sans-serif', 'font.serif': 'Helvetica'})

# Load the example Titanic dataset
titanic = sns.load_dataset("titanic")
print(titanic.head())

# Draw a nested barplot to show survival for class and sex.
# `factorplot` was renamed to `catplot` in seaborn 0.9 and removed in later
# releases, so call `catplot` directly.
g = sns.catplot(
    x="class",
    y="survived",
    hue="sex",
    data=titanic,
    kind="bar",
    palette="muted",
)
g.set_ylabels("survival probability")
g.set_xlabels("Categories")
plt.show()
# PCA of the Iris data set drawn three ways: a manual eigen-decomposition
# plotted with matplotlib, sklearn's PCA plotted with matplotlib, and the
# sklearn result plotted with plotnine (a ggplot2-style API).
import matplotlib
import matplotlib.pylab as pylab
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.linalg  # import the submodule explicitly; `import scipy` alone need not load it
from sklearn import decomposition
from sklearn.datasets import load_iris
from sklearn.preprocessing import scale

# Make Helvetica the default sans-serif font ...
matplotlib.rcParams['font.sans-serif'] = "Helvetica"
# ... and always use the sans-serif family.
matplotlib.rcParams['font.family'] = "sans-serif"

params = {
    'axes.labelsize': '14',
    'xtick.labelsize': '14',
    'ytick.labelsize': '14',
    'lines.linewidth': 1,
    'legend.fontsize': '14',
    'figure.figsize': '6, 6',
}
pylab.rcParams.update(params)

# Load Iris data (once; the same object supplies features, targets and labels)
data = load_iris()
x = data['data']
y = data['target']
Se_targets = pd.Series(
    y,
    index=["iris_%d" % i for i in range(x.shape[0])],
    name="Species",
)
print(y)

# Since PCA is an unsupervised method, we will not be using the target variable y.
# Scale the data matrix x to have zero mean and unit standard deviation. The
# rule of thumb is that if all your columns are measured in the same scale and
# have the same unit of measurement, you don't have to scale the data.
x_s = scale(x, with_mean=True, with_std=True, axis=0)

# Calculate correlation matrix
x_c = np.corrcoef(x_s.T)

# Find eigenvalues and eigenvectors of the correlation matrix
eig_val, r_eig_vec = scipy.linalg.eig(x_c)

# eig() does not promise any ordering, so sort the eigenpairs by decreasing
# eigenvalue before picking the leading components. The matrix is symmetric,
# so the eigenvalues are real and the imaginary part can be dropped.
eig_val = eig_val.real
order = np.argsort(eig_val)[::-1]
eig_val = eig_val[order]
r_eig_vec = r_eig_vec[:, order]

# Share of total variance carried by each component (used for axis labels
# instead of hard-coded percentages).
explained = eig_val / eig_val.sum()

# Select the first two eigenvectors
w = r_eig_vec[:, 0:2]

# Project the dataset from 4 dimensions to 2 using the right eigenvectors
x_rd = x_s.dot(w)

fig = plt.figure(1, figsize=(5, 4))
# Scatter plot the new two dimensions
plt.scatter(x_rd[:, 0], x_rd[:, 1], marker='o', c=y)
plt.xlabel("PCA 1 (%.1f%%)" % (100 * explained[0]))
plt.ylabel("PCA 2 (%.1f%%)" % (100 * explained[1]))
plt.savefig("pca_plot_012.pdf", bbox_inches="tight")
plt.show()

# Using sklearn (fitted on the unscaled data, as in the original script)
fig = plt.figure(2, figsize=(5, 4))
pca = decomposition.PCA(n_components=2)
pca.fit(x)
x = pca.transform(x)
df = pd.DataFrame(x, columns=['PC1', 'PC2'], index=Se_targets.index)
df.to_csv('iris_pca_result.csv', header=True, index=True)
print(x)
plt.scatter(x[:, 0], x[:, 1], marker='o', c=y)
# NOTE: figure 2 is drawn but neither shown nor saved here.

# Using plotnine (ggplot2-style grammar of graphics).
# Star imports are kept as in the original because code outside this section
# may rely on the names they bring in.
from plotnine import *
from plotnine.data import *

df['group'] = Se_targets
p1 = (
    ggplot(df, aes(x='PC1', y='PC2', color='factor(group)', shape='factor(group)'))
    + theme_matplotlib()
    + geom_point(size=3)
    + scale_shape_manual(values=['^', 'o', 's'])
    + scale_color_manual(values=['#FC0D1C', '#1AA68C', '#F08221'])
    + theme(
        axis_text=element_text(size=14, color='black', family='sans-serif'),
        axis_title=element_text(size=14, color='black', family='sans-serif'),
        legend_text=element_text(size=12, color='black', family='sans-serif'),
    )
    + theme(legend_position=(0.8, 0.25), legend_background=element_rect(alpha=0))
    + guides(color=guide_legend(nrow=3), shape=guide_legend(nrow=3))
    + labs(color='', shape='')
)
p1.save('iris_pcoa_plotnine.pdf', width=6, height=5.5)