Source code for covidor.distance

#
#  This file is part of Covidor software
#
#  Copyright (c) 2022 - Sequana Dev Team (https://covidor.readthedocs.io)
#
#  Distributed under the terms of the 3-clause BSD license.
#  The full license is in the LICENSE file, distributed with this software.
#
#  Website:       https://github.com/sequana/covidor
#  Documentation: http://sequana.readthedocs.io
#  Contributors:  https://github.com/sequana/covidor/graphs/contributors
##############################################################################
from covidor.fasta import FastA
import pandas as pd
import numpy as np


__all__ = ['Distance']


[docs]class Distance:
[docs] def compute_pdistance(self, alignment, mode="spike"): """Compute distance between genomes on the spike gene This takes as input the ouput of **mafft** tool on a set of input fasta file:: cat *fasta > in.fasta mafft --auto in.fasta > alignment.fasta Then:: from covidor.distance import Distance d = Distance() d.compute_pdistance("alignment.out") distance is computed as the number of bases that are different along 2 sequences, normalised by the length (and multiplied by 100). Each sequence is identified by the starting and ending 20-bases long sequences. """ # https://www.biorxiv.org/content/10.1101/2021.12.03.471024v1.full#ref-2 # we do not obtain the same results but same trend f = FastA(alignment) # we need to identify the spike with starting/end of wuhan: # first, what is the wuhan index ? # equivalent to MN908947.3 try: index = f.names.index("NC_045512.2") except: index = f.names.index("MN908947.3") if mode == "spike": p2 = f.sequences[index].upper().index("CAAATTACATTACACATAA") p1 = f.sequences[index].upper().index("TGTTTGTTTTTCTTGTTTTA") length= p2-p1+1 N = len(f) df = pd.DataFrame(np.zeros((N,N))) for i in range(N): for j in range(N): S = 0 for x,y in zip(f.sequences[i][p1:p2], f.sequences[j][p1:p2]): if x!=y: S+=1 S/=((p2-p1)/100) df.iloc[i,j] = S df.columns = f.names df.index = f.names self.df = df return df
[docs] def plot(self, alignment, mode="spike"): """ :: d = Distance() d.plot("alignment.out") """ df = self.compute_pdistance(alignment, mode=mode) from pylab import imshow, xticks, yticks, colorbar, title imshow(df) colorbar() title("Distance on spike")