import pandas as pd
import numpy as np
from matplotlib import pyplot
from tqdm import tqdm, tqdm_notebook
import matplotlib.pyplot as plt
import seaborn as sns
import gc, argparse, sys, os, errno
from IPython.core.display import HTML,Image
from functools import reduce
import h5py
%pylab inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm_notebook as tqdm
import scipy
import sklearn
from scipy.stats import pearsonr
import warnings
warnings.filterwarnings('ignore')
from bokeh.io import output_notebook, show
output_notebook()
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import umap
from sklearn.metrics import roc_curve,roc_auc_score,auc
from sklearn.preprocessing import RobustScaler,MinMaxScaler,StandardScaler
from sklearn.neighbors import NearestNeighbors
from bokeh.palettes import Category20c,Set3,Pastel2
from ipywidgets import interact,interactive, FloatSlider,IntSlider, RadioButtons,Dropdown,Tab,Text
from IPython.core.display import HTML,Image
from matplotlib.backends.backend_pdf import PdfPages, PdfFile
from IPython.display import HTML, display, FileLink
from base64 import b64encode, b64decode
from io import StringIO, BytesIO
from contextlib import contextmanager
cd ~chenxupeng/projects/exSEEK_training/
# setup figure template
figure_template_path = 'bin'
if figure_template_path not in sys.path:
sys.path.append(figure_template_path)
from importlib import reload
import figure_template
# force reload of the module
reload(figure_template)
from figure_template import display_dataframe, embed_pdf_figure, embed_pdf_pages,std_plot
fontsize = 6.5
fontscale = 1
fontweight = 'normal'
fonttitle = {'family':'Arial',
'weight' : fontweight,
'size' : fontsize*fontscale}
fontlabel = {'family':'Arial',
'weight' : fontweight,
'size' : fontsize*fontscale}
fontticklabel = {'family':'Arial',
'weight' : fontweight,
'size' : fontsize*fontscale}
fontlegend = {'family':'Arial',
'weight' : fontweight,
#'linewidth':0.5,
'size' : fontsize*fontscale}
fontcbarlabel = {'family':'Arial',
'weight' : fontweight,
#'Rotation' : 270,
#'labelpad' : 25,
'size' : fontsize*fontscale}
fontcbarticklabel = {'family':'Arial',#Helvetica
'weight' : fontweight,
'size' : (fontsize-1)*fontscale}
def std_plot(ax,xlabel=None,ylabel=None,title=None,
legendtitle=None,bbox_to_anchor=None,
labelspacing=0.2,borderpad=0.2,handletextpad=0.2,legendsort=False,markerscale=None,
xlim=None,ylim=None,
xbins=None,ybins=None,
cbar=None,cbarlabel=None,
moveyaxis=False,sns=False,left=True,rotation=None,xticklabel=None,legendscale=True,h=None,l=None,**kwards):
# height = 2 font = 6.5
def autoscale(fig):
if isinstance(fig,matplotlib.figure.Figure):
width,height = fig.get_size_inches()
elif isinstance(fig,matplotlib.axes.Axes):
width,height = fig.figure.get_size_inches()
fontscale = height/2
if width/fontscale > 8:
warnings.warn("Please reset fig's width. When scaling the height to 2 in, the scaled width '%.2f' is large than 8"%(width/fontscale),UserWarning)
return fontscale
class fontprop:
def init(self,fonttitle=None,fontlabel=None,fontticklabel=None,fontlegend=None,fontcbarlabel=None,fontcbarticklabel=None):
self.fonttitle = fonttitle
self.fontlabel = fontlabel
self.fontticklabel = fontticklabel
self.fontlegend = fontlegend
self.fontcbarlabel = fontcbarlabel
self.fontcbarticklabel = fontcbarticklabel
def update(self,fontscale):
self.fonttitle['size'] = self.fonttitle['size']*fontscale
self.fontlabel['size'] = self.fontlabel['size']*fontscale
self.fontticklabel['size'] = self.fontticklabel['size']*fontscale
self.fontlegend['size'] = self.fontlegend['size']*fontscale
self.fontcbarlabel['size'] = self.fontcbarlabel['size']*fontscale
self.fontcbarticklabel['size'] = self.fontcbarticklabel['size']*fontscale
def reset(self,fontscale):
self.fonttitle['size'] = self.fonttitle['size']/fontscale
self.fontlabel['size'] = self.fontlabel['size']/fontscale
self.fontticklabel['size'] = self.fontticklabel['size']/fontscale
self.fontlegend['size'] = self.fontlegend['size']/fontscale
self.fontcbarlabel['size'] = self.fontcbarlabel['size']/fontscale
self.fontcbarticklabel['size'] = self.fontcbarticklabel['size']/fontscale
fontscale = autoscale(ax)
font = fontprop()
font.init(fonttitle,fontlabel,fontticklabel,fontlegend,fontcbarlabel,fontcbarticklabel)
font.update(fontscale)
pyplot.draw()
#plt.figure(linewidth=30.5)
if xlim is not None:
ax.set(xlim=xlim)
if ylim is not None:
ax.set(ylim=ylim)
#pyplot.draw()
if xbins is not None:
locator = MaxNLocator(nbins=xbins)
locator.set_axis(ax.xaxis)
ax.set_xticks(locator())
if ybins is not None:
locator = MaxNLocator(nbins=ybins)
locator.set_axis(ax.yaxis)
ax.set_yticks(locator())
pyplot.draw()
ax.set_xticks(ax.get_xticks())
ax.set_yticks(ax.get_yticks())
ax.set_xlabel(xlabel,fontdict = font.fontlabel,labelpad=(fontsize-1)*fontscale)
ax.set_ylabel(ylabel,fontdict = font.fontlabel,labelpad=(fontsize-1)*fontscale)
if (rotation is not None) & (xticklabel is not None) :
ax.set_xticklabels(xticklabel,fontticklabel,rotation=rotation)
elif (xticklabel is not None) &(rotation is None):
ax.set_xticklabels(xticklabel,fontticklabel)
elif (xticklabel is None) &(rotation is None):
ax.set_xticklabels(ax.get_xticklabels(),fontticklabel)
elif (rotation is not None) & (xticklabel is None):
ax.set_xticklabels(ax.get_xticklabels(),fontticklabel,rotation=rotation)
ax.set_yticklabels(ax.get_yticklabels(),font.fontticklabel)
if moveyaxis is True:
#fontticklabel
ax.spines['left'].set_position(('data',0))
ax.spines['left'].set_visible(left)
ax.spines['right'].set_visible(not left)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_linewidth(0.5*fontscale)
ax.spines['bottom'].set_linewidth(0.5*fontscale)
ax.spines['left'].set_linewidth(0.5*fontscale)
ax.spines['bottom'].set_color('k')
ax.spines['left'].set_color('k')
ax.spines['right'].set_color('k')
ax.tick_params(direction='out', pad=2*fontscale,width=0.5*fontscale)
#ax.spines['bottom']._edgecolor="#000000"
#ax.spines['left']._edgecolor="#000000"
if title is not None:
ax.set_title(title,fontdict = font.fonttitle)
if legendscale is True:
if (h is None)&(l is None):
legend = ax.legend(prop=font.fontlegend,
bbox_to_anchor=bbox_to_anchor,
labelspacing=labelspacing,borderpad=borderpad,handletextpad=handletextpad,
edgecolor="#000000",fancybox=False,markerscale=markerscale,**kwards)
else:
legend = ax.legend(h,l,prop=font.fontlegend,
bbox_to_anchor=bbox_to_anchor,
labelspacing=labelspacing,borderpad=borderpad,handletextpad=handletextpad,
edgecolor="#000000",fancybox=False,markerscale=markerscale,**kwards)
if legendtitle is not None:
#if legendloc is None:
# legendloc="best"
legend = ax.legend(title=legendtitle,prop=font.fontlegend,
bbox_to_anchor=bbox_to_anchor,
labelspacing=labelspacing,borderpad=borderpad,handletextpad=handletextpad,
edgecolor="#000000",fancybox=False,markerscale=markerscale,**kwards)
ax.legend_.get_frame()._linewidth=0.5*fontscale
legend.get_title().set_fontweight('normal')
legend.get_title().set_fontsize(fontscale*fontsize)
if legendsort is True:
# h: handle l:label
h,l = ax.get_legend_handles_labels()
l,h = zip(*sorted(zip(l,h), key=lambda t: int(t[0])))
legend = ax.legend(h,l,title=legendtitle,prop=font.fontlegend,
bbox_to_anchor=bbox_to_anchor,
labelspacing=labelspacing,borderpad=borderpad,handletextpad=handletextpad,
edgecolor="#000000",fancybox=False,markerscale=markerscale,**kwards)
ax.legend_.get_frame()._linewidth=0.5*fontscale
legend.get_title().set_fontweight('normal')
legend.get_title().set_fontsize(fontscale*fontsize)
if sns is True:
h,l = ax.get_legend_handles_labels()
#l,h = zip(*sorted(zip(l,h), key=lambda t: int(t[0])))
legend = ax.legend(h[1:],l[1:],title=legendtitle,prop=font.fontlegend,
bbox_to_anchor=bbox_to_anchor,
labelspacing=labelspacing,borderpad=borderpad,handletextpad=handletextpad,
edgecolor="#000000",fancybox=False,markerscale=markerscale,**kwards)
ax.legend_.get_frame()._linewidth=0.5*fontscale
legend.get_title().set_fontweight('normal')
legend.get_title().set_fontsize(fontscale*fontsize)
else:
legend = ax.legend(handles=h,labels=l,title=legendtitle,prop=font.fontlegend,
bbox_to_anchor=bbox_to_anchor,
labelspacing=labelspacing,borderpad=borderpad,handletextpad=handletextpad,
edgecolor="#000000",fancybox=False,markerscale=markerscale,**kwards)
ax.legend_.get_frame()._linewidth=0.5*fontscale
legend.get_title().set_fontweight('normal')
legend.get_title().set_fontsize(fontscale*fontsize)
if cbar is not None:
#locator, formatter = cbar._get_ticker_locator_formatter()
#ticks, ticklabels, offset_string = cbar._ticker(locator, formatter)
#cbar.ax.spines['top'].set_visible(False)
#cbar.ax.spines['right'].set_visible(False)
#cbar.ax.spines['bottom'].set_visible(False)
#cbar.ax.spines['left'].set_visible(False)
cbar.ax.tick_params(direction='out', pad=3*fontscale,width=0*fontscale,length=0*fontscale)
cbar.set_label(cbarlabel,fontdict = font.fontcbarlabel,Rotation=270,labelpad=fontscale*(fontsize+1))
cbar.ax.set_yticks(cbar.ax.get_yticks())
cbar.ax.set_yticklabels(cbar.ax.get_yticklabels(),font.fontcbarticklabel)
font.reset(fontscale)
return ax
savepath = '/home/chenxupeng/projects/exSEEK_training/'+'output/'+'fig3'+'/'
if not os.path.exists(savepath):
os.mkdir(savepath)
def legendhandle(lists,porm=True,order=0):
'''
input: array,porm palette or marker
palettesorder=0 dataset Category20c
palettesorder=1 batch
return a dic mapping levels of the hue variable to colors
or return a dic mapping levels of the style variable to markers
when use sns function, set palette=dic or markers=dic
'''
if porm == True:
if order == 0:
palette = np.array(Category20c[20]).reshape(4,-1).T.ravel()
if order == 1:
palette = Set3[12]
lists.sort()
dic={}
for i in range(len(lists)):
dic[lists[i]]=palette[i]
return dic
else:
markerlist1 = ['v','^','<','>'] #triangle_down triangle_up triangle_left triangle_left
markerlist2 = ['P','o','X','s'] #plus (filled) circle x (filled) square
#markerlist3 = ['$CPM$','$CPM_top$','$RLE$','$TMM$']
markerlist3 = ['$f$','$g$','$h$','$l$']
markerlist3.sort()
if order == 0:
markers = markerlist2
if order == 1:
markers = markerlist1
if order == 2:
markers = markerlist3
lists.sort()
dic={}
for i in range(len(lists)):
dic[lists[i]]=markers[i]
return dic
#titlelist=['CPM','CPM-top','RLE','TMM']
titlelist=['RLE']
j=0
for i in tqdm((methodlist)):
table = pd.read_table('/home/xieyufeng/fig3/output/'+'cfRNA'+'/matrix_processing/'+i+'.mirna_and_domains.txt',
index_col=0)
plotRLE(table,batch=class_info,label='dataset',path=savepath,filename='RLE_leg_big{}.eps'.format(i),title=titlelist[j])
j=j+1
def plotRLE(mat,batch=None,label=None,logged=False,path=None,filename=None,title=None):
"""
mat: DataFrame, expression matrix
batch: DataFrame, optional, if given, batch.index must be contained in mat.columns
label: One of batch.columns by which the samples are grouped in the figure
"""
log_mat = mat if logged else np.log2(mat+1)
feature_meds = log_mat.apply(np.median,1).tolist()
for i in np.arange(len(feature_meds)):
log_mat.iloc[i] = log_mat.iloc[i] - feature_meds[i]
mat_rle = log_mat
distance = 0
for i in range(mat_rle.shape[1]):
small,large = np.percentile(mat_rle.iloc[:,i], [25, 75])
distance = distance+(large-small)**2
score = distance/mat_rle.shape[1]
stack = mat_rle.stack().reset_index()
stack.rename(columns={stack.columns[2]:"counts", stack.columns[1]: "index"},inplace=True)
#stack['class'] = None
if batch is not None:
batch.index.name = 'index'
batch = batch[label].reset_index()
stack = pd.merge(stack, batch, on=['index'])
fig,ax = plt.subplots(figsize=(2,2))
#ax = sns.boxplot(x='index',y='counts',data=stack.sort_values(by=label),fliersize=0,linewidth=0.1,width=1,hue=label,hue_order=np.unique(np.array(stack.loc[:,label])).sort(),dodge=False)
ax = sns.boxplot(x='index',y='counts',data=stack.sort_values(by=label),
fliersize=0,linewidth=0,width=0.8,hue=label,
hue_order=np.unique(np.array(stack.loc[:,label])).sort(),
notch = True,showfliers=False,showmeans=False,showcaps=False,whiskerprops=dict(linewidth=0.5,color='#D8D8D8'),
dodge=False,palette=legendhandle(np.unique(stack.dataset)))
ax.annotate('variation score: %.2f'%score,xy=(mat_rle.shape[1]*0.4,-9),
fontfamily='Arial',fontsize=5.5)
ax.set(xticks=[])
std_plot(ax,'samples','Relative log expression',legendtitle='label',legendsort=False,title=title,ybins=4,bbox_to_anchor=(1.1,1.1))#,ylim=[-4,4])
else:
fig,ax = plt.subplots(figsize=(3.3,2))
ax = sns.boxplot(x='index',y='counts',data=stack,fliersize=0,linewidth=0.1,width=1,color='g')
ax.set(xticks=[])
std_plot(ax,'sample','RLE',legendtitle='label',legendsort=False,ylim=[-10,10],title=title,ybins=4)
#ax.legend_.remove()
legend = ax.legend(prop=fontlegend,
#labelspacing=labelspacing,borderpad=borderpad,handletextpad=handletextpad,
edgecolor="#000000",fancybox=False,bbox_to_anchor=(1.05, 0.75))
ax.legend_.get_frame()._linewidth=0
ax.legend_.remove()
ax.spines['bottom'].set_visible(False)
fig.tight_layout()
#embed_pdf_figure()
#fig.savefig(path+filename)
def heterogeneity(matlist=methodlist,class_info=batch_info,featurenum=4,featurename=None):
def get_box_data(boxPlotter, boxName):
"""
boxName can be either a name "cat" or a tuple ("cat", "hue")
Here we really have to duplicate seaborn code, because there is not direct access to the
box_data in the BoxPlotter class.
"""
cat = boxName
i = boxPlotter.group_names.index(cat)
group_data = boxPlotter.plot_data[i]
return group_data
def find_x_position_box(boxPlotter, boxName):
cat = boxName
groupPos = boxPlotter.group_names.index(cat)
return groupPos
classname = np.unique(class_info.label)
classname.sort()
colormap = pd.DataFrame(np.array([classname,tableau10m[:len(np.unique(class_info.label))].tolist()]))
colormap = colormap.T
colormap.columns=['label','color']
class_info = class_info.drop(np.where(class_info.label=='GSE123972')[0]).drop(np.where(class_info.label=='GSE94582')[0]).set_index('sample_id').reset_index()
samplemin = np.unique(class_info.label,return_counts=True)[1].min()
new_class_info = pd.DataFrame([])
for i in unique(class_info.label):
extra_class_info = class_info.iloc[np.where(class_info.label==i)]
new_class_info = new_class_info.append(extra_class_info.sample(n=samplemin))
new_class_info = new_class_info.sort_values(by=['label','sample_id']).set_index('sample_id').reset_index()
flag = 0
plot = pd.DataFrame()
for matname in matlist:
mat = pd.read_table('/home/shibinbin/projects/exSeek-dev/output/cfRNA/matrix_processing/'\
+matname,index_col=0)
mat = mat.loc[:,new_class_info.sample_id]
data = np.log2(mat.iloc[np.where(np.isin([i.split('|')[0] for i in mat.index],ref))]+1)
if flag == 0:
featurelist = pd.DataFrame(data.sum(axis=1))
featurelist.columns=['counts']
ref_del = featurelist.sort_values(by='counts',ascending=False).index[:featurenum].tolist()
data_del = data.loc[ref_del]
stack = pd.DataFrame(data_del.stack())
stack = stack.reset_index()
stack.columns=['feature','sample_id','log2(count+1)']
merge = pd.merge(stack,new_class_info,on=['sample_id'])
merge['state'] = matname
plot = plot.append(merge)
plot['name_state']=[plot.feature.iloc[i].split('|')[0]+'|'+plot.state.iloc[i] for i in range(len(plot.feature))]
#plot = plot.sort_values(by=['name_state'])
plot = plot.set_index('feature').reset_index()
for feature in np.unique(plot.feature):
if (feature.split('|')[0]==featurename)|(featurename==None):
data_sub = plot.iloc[np.where(plot.feature == feature)]
data_sub = data_sub.set_index('feature').reset_index()
#colormap = pd.DataFrame(np.array([np.unique(data_sub.label),np.array(Category20c[20]).reshape(4,-1).T.ravel()[:len(np.unique(data_sub.label))].tolist()]))
#colormap = colormap.T
#colormap.columns=['label','color']
#data_sub = data_sub.merge(colormap)
data_sub = pd.merge(data_sub, colormap, how='left', on=['label'])
fig,ax = plt.subplots(figsize=(3,2))
ylist=[0]*len(matlist)
data_sub_sub={}
merge=pd.DataFrame()
datasetprop=pd.DataFrame()
for label in np.unique(data_sub.label):
data_sub_sub[label] = data_sub.iloc[np.where(data_sub.label == label)]
#data_sub_sub[label].to_csv('./'+label+'.txt',sep='\t')
for i in np.unique(data_sub_sub[label].state):
a = data_sub_sub[label][data_sub_sub[label].state==i]
datasetprop.loc['var',i]=np.var(a['log2(count+1)'],ddof=1)
datasetprop.loc['mean',i]=np.mean(a['log2(count+1)'])
#score['mean']=np.mean(a['log2(count+1)'])
score = pd.DataFrame(datasetprop.stack()).reset_index()
score['dataset']='GSE113994'
merge = merge.append(score)
data_sub_sub[label]['state_sample_id'] = [data_sub_sub[label].state.iloc[i]+'|'+\
str(i) for i in range(len(data_sub_sub[label]))]
sns.pointplot(ax=ax,x="state_sample_id", y="log2(count+1)",palette=data_sub_sub[label].color,hue=data_sub_sub[label].label,
data=data_sub_sub[label],scale=0.2)
#ax.scatter(data_sub_sub.state_sample_id.tolist(),data_sub_sub['log2(count+1)'].tolist(),color=data_sub_sub.color.tolist())
boxPlotter = sns.categorical._BoxPlotter(data=data_sub_sub[label],x='state_sample_id',y='log2(count+1)',hue=data_sub_sub[label].label,
order=None, hue_order=None,
orient=None, width=.8, color=None, palette=None, saturation=.75,
dodge=True, fliersize=5, linewidth=None)
linenum = len(matlist)
start = ax.get_xticks()[0]
binwidth = math.ceil((ax.get_xticks()[0]+ax.get_xticks()[-1])/linenum)
for loc in range(linenum):
box = [boxPlotter.group_names[i] for i in range(start+loc*binwidth,start+(loc+1)*binwidth)]
box_data = []
for i in box:
box_data.append(get_box_data(boxPlotter, i)[0])
ylim = ax.get_ylim()
yRange = ylim[1] - ylim[0]
lineYOffsetAxesCoord = 0.05
lineYOffsetToBoxAxesCoord = 0.06
lineHeightAxesCoord=0.02
yOffset = lineYOffsetAxesCoord*yRange
yOffsetToBox = lineYOffsetToBoxAxesCoord*yRange
ymax = np.array(box_data).max()
y = ymax + yOffsetToBox
if y>=ylist[loc]:
ylist[loc]=y
merge.rename(columns={merge.columns[0]:"prop",
merge.columns[1]: "state",
merge.columns[2]: "value"},inplace=True)
plotprop=pd.DataFrame()
for i in np.unique(merge.state):
b = merge[merge.state==i]
mean = b[b.prop=='mean'].value
mean_var=np.var(mean,ddof=1)
var= b[b.prop=='var'].value
mul = 1
for item in var:
mul *= item
plotprop.loc[i,'mean_var']=mean_var
plotprop.loc[i,'var_mul']=mul
plotprop=plotprop.rename(index={#'filter.null.Norm_CPM.mirna_and_domains.txt':'CPM',
#'filter.null.Norm_CPM_top.mirna_and_domains.txt':'CPM-top',
'filter.null.Norm_RLE.mirna_and_domains.txt':'RLE',
#'filter.null.Norm_TMM.mirna_and_domains.txt':'TMM',
'filter.null.mirna_and_domains.txt':'Raw'})
display(plotprop)
h = lineHeightAxesCoord*yRange
#title = [i.split('.')[2] for i in matlist]
#title = ['Raw' if x == 'mirna_and_domains' else x for x in title]
#title = ['Raw','RLE','CPM','CPM-top','TMM']
title = ['Raw','RLE']
for loc in range(linenum):
lineX, lineY = [start+loc*binwidth,start+(loc+1)*binwidth], [ylist[loc]+h,ylist[loc]+h]
ax.plot(lineX, lineY,color='Black',linewidth='0.5')
ax.annotate(title[loc]+'\n'+'$b$: '+'%.2f'%(plotprop.loc[title[loc],'mean_var'])+' $w$: '+'%.2f'%(plotprop.loc[title[loc],'var_mul']),
xy=(np.mean([start+loc*binwidth,start+(loc+1)*binwidth]), ylist[loc]+h),
xytext=(0, 1), textcoords='offset points',
xycoords='data', ha='center', va='bottom', fontfamily='Arial',fontsize=5.5,
clip_on=False, annotation_clip=False)
ax.spines['bottom'].set_visible(False)
ax.set_xticks([])
#ax.legend(h,l,prop=fontlegend)
#std_plot(ax,'','',legendtitle='label',legendsort=False,title=feature.split('|')[0])
#ax.legend_.remove()
std_plot(ax,'','Normalized counts',legendtitle='label',legendsort=False,title='Heterogeneity of '+feature.split('|')[0]+' expression')
legend = ax.legend(prop=fontlegend,
#labelspacing=labelspacing,borderpad=borderpad,handletextpad=handletextpad,
edgecolor="#000000",fancybox=False,bbox_to_anchor=(1, 1.0),framealpha=0,markerscale=5)
ax.legend_.get_frame()._linewidth=0
#ax.legend_.remove()
fig.tight_layout()
#embed_pdf_figure()
#fig.savefig(savepath+'{}_heterogeneity_noleg_2.eps'.format(feature.split('|')[0]))
#heterogeneity()
heterogeneity(featurename='hsa-miR-21-5p')
heterogeneity(featurenum=8)
def heterogeneity(matlist=methodlist,class_info=batch_info,featurenum=4,featurename1=None,featurename2=None):
def get_box_data(boxPlotter, boxName):
"""
boxName can be either a name "cat" or a tuple ("cat", "hue")
Here we really have to duplicate seaborn code, because there is not direct access to the
box_data in the BoxPlotter class.
"""
cat = boxName
i = boxPlotter.group_names.index(cat)
group_data = boxPlotter.plot_data[i]
return group_data
def find_x_position_box(boxPlotter, boxName):
cat = boxName
groupPos = boxPlotter.group_names.index(cat)
return groupPos
classname = np.unique(class_info.label)
classname.sort()
colormap = pd.DataFrame(np.array([classname,tableau10m[:len(np.unique(class_info.label))].tolist()]))
colormap = colormap.T
colormap.columns=['label','color']
class_info = class_info.drop(np.where(class_info.label=='GSE123972')[0]).drop(np.where(class_info.label=='GSE94582')[0]).set_index('sample_id').reset_index()
samplemin = np.unique(class_info.label,return_counts=True)[1].min()
new_class_info = pd.DataFrame([])
for i in unique(class_info.label):
extra_class_info = class_info.iloc[np.where(class_info.label==i)]
new_class_info = new_class_info.append(extra_class_info.sample(n=samplemin))
new_class_info = new_class_info.sort_values(by=['label','sample_id']).set_index('sample_id').reset_index()
flag = 0
plot = pd.DataFrame()
for matname in matlist:
mat = pd.read_table('/home/shibinbin/projects/exSeek-dev/output/cfRNA/matrix_processing/'\
+matname,index_col=0)
mat = mat.loc[:,new_class_info.sample_id]
data = np.log2(mat.iloc[np.where(np.isin([i.split('|')[0] for i in mat.index],ref))]+1)
if flag == 0:
featurelist = pd.DataFrame(data.sum(axis=1))
featurelist.columns=['counts']
ref_del = featurelist.sort_values(by='counts',ascending=False).index[:featurenum].tolist()
data_del = data.loc[ref_del]
stack = pd.DataFrame(data_del.stack())
stack = stack.reset_index()
stack.columns=['feature','sample_id','log2(count+1)']
merge = pd.merge(stack,new_class_info,on=['sample_id'])
merge['state'] = matname
plot = plot.append(merge)
plot['name_state']=[plot.feature.iloc[i].split('|')[0]+'|'+plot.state.iloc[i] for i in range(len(plot.feature))]
#plot = plot.sort_values(by=['name_state'])
plot = plot.set_index('feature').reset_index()
fig,ax = plt.subplots(figsize=(5,2))
for feature in np.unique(plot.feature):
if feature.split('|')[0]==featurename1:
print(1)
data_sub = plot.iloc[np.where(plot.feature == feature)]
data_sub = data_sub.set_index('feature').reset_index()
#colormap = pd.DataFrame(np.array([np.unique(data_sub.label),np.array(Category20c[20]).reshape(4,-1).T.ravel()[:len(np.unique(data_sub.label))].tolist()]))
#colormap = colormap.T
#colormap.columns=['label','color']
#data_sub = data_sub.merge(colormap)
data_sub = pd.merge(data_sub, colormap, how='left', on=['label'])
ylist=[0]*len(matlist)
data_sub_sub={}
merge=pd.DataFrame()
datasetprop=pd.DataFrame()
for label in np.unique(data_sub.label):
data_sub_sub[label] = data_sub.iloc[np.where(data_sub.label == label)]
#data_sub_sub[label].to_csv('./'+label+'.txt',sep='\t')
for i in np.unique(data_sub_sub[label].state):
a = data_sub_sub[label][data_sub_sub[label].state==i]
datasetprop.loc['var',i]=np.var(a['log2(count+1)'],ddof=1)
datasetprop.loc['mean',i]=np.mean(a['log2(count+1)'])
#score['mean']=np.mean(a['log2(count+1)'])
score = pd.DataFrame(datasetprop.stack()).reset_index()
score['dataset']='GSE113994'
merge = merge.append(score)
data_sub_sub[label]['state_sample_id'] = [data_sub_sub[label].state.iloc[i]+'|'+\
str(i) for i in range(len(data_sub_sub[label]))]
d=data_sub_sub[label]
d.label = [ i+ '|' + featurename1 for i in d.label]
sns.pointplot(ax=ax,x="state_sample_id", y="log2(count+1)",palette=d.color,hue=d.label,
data=d,scale=0.5)
#ax.scatter(data_sub_sub.state_sample_id.tolist(),data_sub_sub['log2(count+1)'].tolist(),color=data_sub_sub.color.tolist())
boxPlotter = sns.categorical._BoxPlotter(data=data_sub_sub[label],x='state_sample_id',y='log2(count+1)',hue=data_sub_sub[label].label,
order=None, hue_order=None,
orient=None, width=.8, color=None, palette=None, saturation=.75,
dodge=True, fliersize=5, linewidth=None)
linenum = len(matlist)
start = ax.get_xticks()[0]
binwidth = math.ceil((ax.get_xticks()[0]+ax.get_xticks()[-1])/linenum)
for loc in range(linenum):
box = [boxPlotter.group_names[i] for i in range(start+loc*binwidth,start+(loc+1)*binwidth)]
box_data = []
for i in box:
box_data.append(get_box_data(boxPlotter, i)[0])
ylim = ax.get_ylim()
yRange = ylim[1] - ylim[0]
lineYOffsetAxesCoord = 0.05
lineYOffsetToBoxAxesCoord = 0.06
lineHeightAxesCoord=0.02
yOffset = lineYOffsetAxesCoord*yRange
yOffsetToBox = lineYOffsetToBoxAxesCoord*yRange
ymax = np.array(box_data).max()
y = ymax + yOffsetToBox
if y>=ylist[loc]:
ylist[loc]=y
merge.rename(columns={merge.columns[0]:"prop",
merge.columns[1]: "state",
merge.columns[2]: "value"},inplace=True)
plotprop=pd.DataFrame()
for i in np.unique(merge.state):
b = merge[merge.state==i]
mean = b[b.prop=='mean'].value
mean_var=np.var(mean,ddof=1)
var= b[b.prop=='var'].value
mul = 1
for item in var:
mul *= item
plotprop.loc[i,'mean_var']=mean_var
plotprop.loc[i,'var_mul']=mul
plotprop=plotprop.rename(index={#'filter.null.Norm_CPM.mirna_and_domains.txt':'CPM',
#'filter.null.Norm_CPM_top.mirna_and_domains.txt':'CPM-top',
'filter.null.Norm_RLE.mirna_and_domains.txt':'RLE',
#'filter.null.Norm_TMM.mirna_and_domains.txt':'TMM',
'filter.null.mirna_and_domains.txt':'Raw'})
#display(plotprop)
h = lineHeightAxesCoord*yRange
#title = [i.split('.')[2] for i in matlist]
#title = ['Raw' if x == 'mirna_and_domains' else x for x in title]
#title = ['Raw','RLE','CPM','CPM-top','TMM']
title = ['Raw','RLE']
for loc in range(linenum):
lineX, lineY = [start+loc*binwidth,start+(loc+1)*binwidth], [ylist[loc]+h,ylist[loc]+h]
ax.plot(lineX, lineY,color='Black',linewidth='0.5')
ax.annotate(title[loc]+'\n'+'$b$: '+'%.2f'%(plotprop.loc[title[loc],'mean_var'])+' $w$: '+'%.2f'%(plotprop.loc[title[loc],'var_mul']),
xy=(np.mean([start+loc*binwidth,start+(loc+1)*binwidth]), ylist[loc]+h),
xytext=(0, 1), textcoords='offset points',
xycoords='data', ha='center', va='bottom', fontfamily='Arial',fontsize=5.5,
clip_on=False, annotation_clip=False)
if feature.split('|')[0]==featurename2:
print(2)
data_sub = plot.iloc[np.where(plot.feature == feature)]
data_sub = data_sub.set_index('feature').reset_index()
#colormap = pd.DataFrame(np.array([np.unique(data_sub.label),np.array(Category20c[20]).reshape(4,-1).T.ravel()[:len(np.unique(data_sub.label))].tolist()]))
#colormap = colormap.T
#colormap.columns=['label','color']
#data_sub = data_sub.merge(colormap)
data_sub = pd.merge(data_sub, colormap, how='left', on=['label'])
ylist=[100]*len(matlist)
data_sub_sub={}
merge=pd.DataFrame()
datasetprop=pd.DataFrame()
for label in np.unique(data_sub.label):
data_sub_sub[label] = data_sub.iloc[np.where(data_sub.label == label)]
#data_sub_sub[label].to_csv('./'+label+'.txt',sep='\t')
for i in np.unique(data_sub_sub[label].state):
a = data_sub_sub[label][data_sub_sub[label].state==i]
datasetprop.loc['var',i]=np.var(a['log2(count+1)'],ddof=1)
datasetprop.loc['mean',i]=np.mean(a['log2(count+1)'])
#score['mean']=np.mean(a['log2(count+1)'])
score = pd.DataFrame(datasetprop.stack()).reset_index()
score['dataset']='GSE113994'
merge = merge.append(score)
data_sub_sub[label]['state_sample_id'] = [data_sub_sub[label].state.iloc[i]+'|'+\
str(i) for i in range(len(data_sub_sub[label]))]
d = data_sub_sub[label]
d.label = [ i+ '|' + featurename2 for i in d.label]
sns.pointplot(ax=ax,x="state_sample_id", y="log2(count+1)",palette=d.color,hue=d.label,
data=d,scale=0.5,linestyles='--',markers='X')
#ax.scatter(data_sub_sub.state_sample_id.tolist(),data_sub_sub['log2(count+1)'].tolist(),color=data_sub_sub.color.tolist())
boxPlotter = sns.categorical._BoxPlotter(data=data_sub_sub[label],x='state_sample_id',y='log2(count+1)',hue=data_sub_sub[label].label,
order=None, hue_order=None,
orient=None, width=.8, color=None, palette=None, saturation=.75,
dodge=True, fliersize=5, linewidth=None)
linenum = len(matlist)
start = ax.get_xticks()[0]
binwidth = math.ceil((ax.get_xticks()[0]+ax.get_xticks()[-1])/linenum)
for loc in range(linenum):
box = [boxPlotter.group_names[i] for i in range(start+loc*binwidth,start+(loc+1)*binwidth)]
box_data = []
for i in box:
box_data.append(get_box_data(boxPlotter, i)[0])
ylim = ax.get_ylim()
yRange = ylim[1] - ylim[0]
lineYOffsetAxesCoord = 0.05
lineYOffsetToBoxAxesCoord = 0.06
lineHeightAxesCoord=0.02
yOffset = lineYOffsetAxesCoord*yRange
yOffsetToBox = lineYOffsetToBoxAxesCoord*yRange
ymin = np.array(box_data).min()
y = ymin - yOffsetToBox
if y<=ylist[loc]:
ylist[loc]=y
merge.rename(columns={merge.columns[0]:"prop",
merge.columns[1]: "state",
merge.columns[2]: "value"},inplace=True)
plotprop=pd.DataFrame()
for i in np.unique(merge.state):
b = merge[merge.state==i]
mean = b[b.prop=='mean'].value
mean_var=np.var(mean,ddof=1)
var= b[b.prop=='var'].value
mul = 1
for item in var:
mul *= item
plotprop.loc[i,'mean_var']=mean_var
plotprop.loc[i,'var_mul']=mul
plotprop=plotprop.rename(index={#'filter.null.Norm_CPM.mirna_and_domains.txt':'CPM',
#'filter.null.Norm_CPM_top.mirna_and_domains.txt':'CPM-top',
'filter.null.Norm_RLE.mirna_and_domains.txt':'RLE',
#'filter.null.Norm_TMM.mirna_and_domains.txt':'TMM',
'filter.null.mirna_and_domains.txt':'Raw'})
#display(plotprop)
h = lineHeightAxesCoord*yRange
#title = [i.split('.')[2] for i in matlist]
#title = ['Raw' if x == 'mirna_and_domains' else x for x in title]
#title = ['Raw','RLE','CPM','CPM-top','TMM']
title = ['Raw','RLE']
for loc in range(linenum):
lineX, lineY = [start+loc*binwidth,start+(loc+1)*binwidth], [ylist[loc]-h,ylist[loc]-h]
ax.plot(lineX, lineY,color='Black',linewidth='0.5')
ax.annotate(title[loc]+'\n'+'$b$: '+'%.2f'%(plotprop.loc[title[loc],'mean_var'])+' $w$: '+'%.2f'%(plotprop.loc[title[loc],'var_mul']),
xy=(np.mean([start+loc*binwidth,start+(loc+1)*binwidth]), ylist[loc]-20*h),
xytext=(0, 1), textcoords='offset points',
xycoords='data', ha='center', va='bottom', fontfamily='Arial',fontsize=5.5,
clip_on=False, annotation_clip=False)
ax.spines['bottom'].set_visible(False)
ax.set_xticks([])
#ax.legend(h,l,prop=fontlegend)
#std_plot(ax,'','',legendtitle='label',legendsort=False,title=feature.split('|')[0])
#ax.legend_.remove()
std_plot(ax,'','Normalized counts',legendtitle='label',legendsort=False,title='Expression heterogeneity of reference genes')
legend = ax.legend(prop=fontlegend,
#labelspacing=labelspacing,borderpad=borderpad,handletextpad=handletextpad,
edgecolor="#000000",fancybox=False,bbox_to_anchor=(1, 0.8),framealpha=0,markerscale=2)
ax.legend_.get_frame()._linewidth=0
#ax.legend_.remove()
fig.tight_layout()
#embed_pdf_figure()
#fig.savefig(savepath+'{}_heterogeneity_noleg_2.eps'.format(feature.split('|')[0]))
#heterogeneity()
methodlist = []
for i in normlist:
for j in batchlist:
methodlist.append(i+'.'+j)
methodlist
batch_info = pd.read_table('/home/xieyufeng/fig3/data/cfRNA/batch_info.txt',index_col=0)
batch_info = pd.read_table('/home/zhaotianxiao/fig3/batch_info.txt', index_col=0)
batch_info[batch_info.dataset=='lulab_hcc']='GSE123972'
sampleclass = batch_info.iloc[:,0]
knn_list=[]
for i in tqdm(methodlist):
table = pd.read_table('/home/xieyufeng/fig3/output/'+'cfRNA'+'/matrix_processing/'+i+'.mirna_and_domains.txt',
index_col=0)
knn_list.append(get_knn_score(table,sampleclass))
knn_summary = pd.DataFrame(data={'preprocess_method':methodlist,'knn_score':list(knn_list)})
knn_summary = knn_summary.set_index('preprocess_method')
class_info = pd.read_table('/home/xieyufeng/fig3/data/cfRNA/sample_classes.txt',index_col=0)
sampleclass = class_info
uca_list=[]
for i in tqdm(methodlist):
table = pd.read_table('/home/xieyufeng/fig3/output/'+'cfRNA'+'/matrix_processing/'+i+'.mirna_and_domains.txt',
index_col=0)
uca_list.append(get_uca_score(table,sampleclass))
uca_summary = pd.DataFrame(data={'preprocess_method':methodlist,'uca_score':list(uca_list)})
uca_summary = uca_summary.set_index('preprocess_method')
get_uca_score(table,sampleclass)
from scipy.stats import pearsonr
pearsonr(uca_summary,knn_summary)
merge = pd.concat([knn_summary,uca_summary],axis=1)
merge['impute'] = [method.split('.')[1] for method in merge.index]
merge['normalization'] = [method.split('.')[2] for method in merge.index]
merge['batch'] = [method.split('.')[3] for method in merge.index]
sizelist=[10,50,200]
impute_list = np.unique(merge['impute'])
merge['imputation_size'] = merge['impute']
for i in np.arange(len(impute_list)):
where = np.where(merge['imputation_size']==impute_list[i])
for j in where:
merge['imputation_size'].iloc[j]=sizelist[i]
merge.knn_score =1-merge.knn_score
merge = merge.drop(merge.iloc[np.where(np.array([i.split('.')[-1] for i in merge.index]) == 'Batch_RUVn_1')[0]].index)