#
# This file is part of Covidor software
#
# Copyright (c) 2022 - Sequana Dev Team (https://covidor.readthedocs.io)
#
# Distributed under the terms of the 3-clause BSD license.
# The full license is in the LICENSE file, distributed with this software.
#
# Website: https://github.com/sequana/covidor
# Documentation: http://sequana.readthedocs.io
# Contributors: https://github.com/sequana/covidor/graphs/contributors
##############################################################################
from covidor.fasta import FastA
import pandas as pd
import numpy as np
__all__ = ['Distance']
[docs]class Distance:
[docs] def compute_pdistance(self, alignment, mode="spike"):
"""Compute distance between genomes on the spike gene
This takes as input the ouput of **mafft** tool on a set of input fasta file::
cat *fasta > in.fasta
mafft --auto in.fasta > alignment.fasta
Then::
from covidor.distance import Distance
d = Distance()
d.compute_pdistance("alignment.out")
distance is computed as the number of bases that are different along 2 sequences, normalised
by the length (and multiplied by 100). Each sequence is identified by the starting and ending
20-bases long sequences.
"""
# https://www.biorxiv.org/content/10.1101/2021.12.03.471024v1.full#ref-2
# we do not obtain the same results but same trend
f = FastA(alignment)
# we need to identify the spike with starting/end of wuhan:
# first, what is the wuhan index ?
# equivalent to MN908947.3
try:
index = f.names.index("NC_045512.2")
except:
index = f.names.index("MN908947.3")
if mode == "spike":
p2 = f.sequences[index].upper().index("CAAATTACATTACACATAA")
p1 = f.sequences[index].upper().index("TGTTTGTTTTTCTTGTTTTA")
length= p2-p1+1
N = len(f)
df = pd.DataFrame(np.zeros((N,N)))
for i in range(N):
for j in range(N):
S = 0
for x,y in zip(f.sequences[i][p1:p2], f.sequences[j][p1:p2]):
if x!=y:
S+=1
S/=((p2-p1)/100)
df.iloc[i,j] = S
df.columns = f.names
df.index = f.names
self.df = df
return df
[docs] def plot(self, alignment, mode="spike"):
"""
::
d = Distance()
d.plot("alignment.out")
"""
df = self.compute_pdistance(alignment, mode=mode)
from pylab import imshow, xticks, yticks, colorbar, title
imshow(df)
colorbar()
title("Distance on spike")