Source code for covidor.distance

#
#  This file is part of Covidor software
#
#  Copyright (c) 2022 - Sequana Dev Team (https://covidor.readthedocs.io)
#
#  Distributed under the terms of the 3-clause BSD license.
#  The full license is in the LICENSE file, distributed with this software.
#
#  Website:       https://github.com/sequana/covidor
#  Documentation: http://sequana.readthedocs.io
#  Contributors:  https://github.com/sequana/covidor/graphs/contributors
##############################################################################
from covidor.fasta import FastA
import pandas as pd
import numpy as np


__all__ = ['Distance']


[docs]class Distance:

[docs]    def compute_pdistance(self, alignment, mode="spike"):
        """Compute distance between genomes on the spike gene

        This takes as input the ouput of **mafft** tool on a set of input fasta file::

            cat *fasta > in.fasta
            mafft --auto in.fasta > alignment.fasta

        Then::

            from covidor.distance import Distance
            d = Distance()
            d.compute_pdistance("alignment.out")
            

        distance is computed as the number of bases that are different along 2 sequences, normalised 
        by the length (and multiplied by 100). Each sequence is identified by the starting and ending
        20-bases long sequences. 
        
        

        """
        # https://www.biorxiv.org/content/10.1101/2021.12.03.471024v1.full#ref-2
        # we do not obtain the same results but same trend

        f = FastA(alignment)

        # we need to identify the spike with starting/end of wuhan:
        # first, what is the wuhan index ?
        # equivalent to MN908947.3
        try:
            index = f.names.index("NC_045512.2")
        except:
            index = f.names.index("MN908947.3")

        if mode == "spike":
            p2 = f.sequences[index].upper().index("CAAATTACATTACACATAA")
            p1 = f.sequences[index].upper().index("TGTTTGTTTTTCTTGTTTTA")
            length= p2-p1+1

        N = len(f)
        df = pd.DataFrame(np.zeros((N,N)))

        for i in range(N):
            for j in range(N):
                S = 0
                for x,y in zip(f.sequences[i][p1:p2], f.sequences[j][p1:p2]):
                    if x!=y: 
                        S+=1
                S/=((p2-p1)/100)
                df.iloc[i,j] = S
        df.columns = f.names
        df.index = f.names

        self.df = df 
        return df

[docs]    def plot(self, alignment, mode="spike"):
        """

        ::

            d = Distance()
            d.plot("alignment.out")

        """
        df = self.compute_pdistance(alignment, mode=mode)
        from pylab import imshow, xticks, yticks, colorbar, title
        imshow(df)
        colorbar()
        title("Distance on spike")