SeqLike Accessor API
SeqLikeAccessor
This class extends the Pandas Series class to include a 'seq'
namespace, which exposes a number of methods that work on a series
of SeqLikes. Because of the decorator, when SeqLike is imported,
this extension is automatically in effect. Methods are accessed
by calling df["column_name"].seq.method()
By convention, we expect that the series is composed of SeqLike objects all of the same type. They do not need to be the same length.
Most methods will return new Pandas Series with new SeqLikes (as copies).
Source code in seqlike/SeqLikeAccessor.py
@pd.api.extensions.register_series_accessor("seq")
class SeqLikeAccessor:
    """This class extends the Pandas Series class to include a 'seq'
    namespace, which exposes a number of methods that work on a series
    of SeqLikes. Because of the decorator, when SeqLike is imported,
    this extension is automatically in effect. Methods are accessed
    by calling `df["column_name"].seq.method()`

    By convention, we expect that the series is composed of SeqLike
    objects all of the same type. They do not need to be the same
    length.

    Most methods will return new Pandas Series with new SeqLikes (as
    copies).
    """
def __init__(self, pandas_obj):
    """Validate the Series, then keep it with matched alphabets.

    :param pandas_obj: The pandas Series this accessor wraps.
    """
    # Validation runs first, so alphabet matching only ever sees a
    # Series that already passed the checks.
    self._validate(pandas_obj)
    harmonized = self._match_alphabets(pandas_obj)
    self._obj = harmonized
@staticmethod
def _validate(obj):
    """Check that a Series holds only SeqLikes that share one sequence type.

    :param obj: The pandas Series to check.
    :raises ValueError: When any element is not a SeqLike, or when the
        SeqLikes do not all have the same `_type` attribute
        (compared case-insensitively).
    """
    is_seqlikes = obj.apply(lambda x: isinstance(x, SeqLike))
    if not is_seqlikes.all():
        # Gather the offending types only on the failure path, and
        # de-duplicate the *types* rather than the elements: elements are
        # arbitrary objects here and need not be hashable, whereas types
        # always are.
        types = obj.apply(lambda x: type(x)).drop_duplicates()
        raise ValueError(f"Series must contain all SeqLike objects, instead found {types}")
    # All SeqLikes must be of the same type
    if len(set(obj.apply(lambda s: s._type.upper()))) != 1:
        raise ValueError("All SeqLikes must be of the same _type (i.e. 'aa', or 'nt').")
@staticmethod
def _match_alphabets(obj):
    """If the alphabets are not identical, set them all to the 'full' alphabet.

    :note: This will clobber any custom alphabets.

    :param obj: The pandas Series to apply this function to.
    :returns: `obj` itself when every SeqLike already has the same
        alphabet; otherwise a new Series in which each SeqLike has been
        rebuilt with the full alphabet (`AA` or `NT`, chosen from the
        `_type` of the first element).
    """
    alphabets = obj.apply(lambda s: s.alphabet)
    if len(alphabets.unique()) == 1:
        return obj

    # TODO: we might want to convert only those SeqLikes
    # that don't have the same alphabet as the original.
    # Doing so could avoid a bunch of deepcopying.
    warnings.warn(
        "It appears that the sequences here have multiple alphabets. "
        "We are replacing alphabets with the full version (AA/NT) "
        "for the full collection. "
    )
    # Compare case-insensitively, the same way `_validate` and the `_type`
    # property do, so a lower-case 'aa' type is not mistaken for NT.
    full_alphabet = AA if obj.iloc[0]._type.upper() == "AA" else NT
    return obj.apply(lambda s: SeqLike(s, seq_type=s._type, alphabet=full_alphabet))
def write(self, *args, **kwargs):
    """Write every sequence in the Series out through `SeqIO.write`.

    Each element is converted with its `to_seqrecord()` method first.

    :param *args: Passed into `SeqIO.write` after the list of records.
    :param **kwargs: Passed on to `SeqIO.write`.
    """
    records = []
    for seqlike in self._obj:
        records.append(seqlike.to_seqrecord())
    SeqIO.write(records, *args, **kwargs)
def plot(self, use_bokeh=True, colorscheme=None, x_scale=1, y_scale=1, *args, **kwargs):
    """Plot the SeqLikes as a multiple sequence alignment.

    All *args and **kwargs parameters mirror
    .draw_utils.draw_alignment and .draw_utils.view_alignment.
    We use .as_alignment() for convenience.

    :param use_bokeh: bool; if True (default), try `view_alignment` first
        and fall back to `draw_alignment` if it raises.
    :param colorscheme: ColorScheme, WebLogo based mapping of symbol to color.
        When None, `nt_simple` is used for NT and `aa_chemistry_simple` for AA.
    :param x_scale: float, x scaling factor used with draw_alignment backend
    :param y_scale: float, y scaling factor used with draw_alignment backend
    :param *args: Passed into `view_alignment`, and into `draw_alignment`
        on the fallback path.
    :param **kwargs: Passed into `view_alignment`, and into `draw_alignment`
        on the fallback path.
    :returns: whatever `view_alignment` returns, or the resized image
        produced by `draw_alignment`.
    """
    if colorscheme is None:
        seq_type = self._type
        if seq_type == "NT":
            colorscheme = nt_simple
        elif seq_type == "AA":
            colorscheme = aa_chemistry_simple

    # Build the alignment once; both backends consume the same object.
    alignment = self.as_alignment()

    if use_bokeh:
        try:
            return view_alignment(alignment, *args, colorscheme=colorscheme, **kwargs)
        except Exception:
            # Deliberate best-effort fallback to the static backend.
            # `Exception` (not a bare except) so KeyboardInterrupt and
            # SystemExit still propagate.
            pass

    image = draw_alignment(alignment, *args, colorscheme=colorscheme, **kwargs)
    width, height = image.size[0], image.size[1]
    return image.resize(size=(int(width * x_scale), int(height * y_scale)))
def weblogo(
    self,
    seqnum_labels=None,
    ref_id=None,
    cols=50,
    color_scheme=aa_chemistry_simple,
    logo_font="ArialMT",
    logo_format="png",
    resolution=200,
    **kwargs,
):
    """Draw weblogo from sequence alignment with optional labeling for consensus mutations

    :param seqnum_labels: label the weblogo letters with these seqnums
    :param ref_id: derive the weblogo letter labels using seqnums from this reference record;
        ignored when `seqnum_labels` is given. If the reference has no "seqnums"
        letter annotation, positions are numbered from 1.
    :param cols: weblogo column width (number of letters)
    :param color_scheme: weblogo color scheme (called with no arguments to build it)
    :param logo_font: weblogo font
    :param logo_format: weblogo image format ('png', 'eps', 'jpg', etc)
    :param resolution: weblogo resolution in DPI
    :param **kwargs: additional weblogo arguments
    :returns: PIL Image object
    :raises AssertionError: when `seqnum_labels` is given but its length
        differs from the longest sequence.
    """
    import weblogo as wl

    def highlight_consensus(labels, consensus, ref):
        # Where the consensus differs from the reference, prefix the label
        # with the reference letter, e.g. "[A12]".
        return [
            f"[{ref_letter}{label}]" if consensus_letter != ref_letter else label
            for label, consensus_letter, ref_letter in zip(labels, consensus, ref)
        ]

    # make sequence labels
    consensus = self.consensus()
    if seqnum_labels:
        assert len(seqnum_labels) == self.max_length(), "Number of labels does not match sequence length"
    elif ref_id:
        refseq = self.get_seq_by_id(ref_id)
        if "seqnums" in refseq.letter_annotations:
            seqnum_labels = refseq.letter_annotations["seqnums"]
        else:
            # Without this fallback the labels would still be None here and
            # highlight_consensus would fail when zipping over them.
            seqnum_labels = list(range(1, len(consensus) + 1))
        # if reference sequence provided, highlight the consensus positions
        seqnum_labels = highlight_consensus(seqnum_labels, consensus, refseq)
    else:
        seqnum_labels = range(1, len(consensus) + 1)

    # set weblogo options
    opts = wl.LogoOptions(
        formatter=wl.formatters[logo_format],
        stacks_per_line=cols,
        color_scheme=color_scheme(),
        logo_font=logo_font,
        resolution=resolution,
        **kwargs,
    )
    opts.rotate_numbers = True
    opts.annotate = seqnum_labels

    # remove gap and stop characters so that they are not included in weblogo
    ignore = [gap_letter, stop_letter]
    alphabet = wl.seq.Alphabet("".join(s for s in self.alphabet if s not in ignore))
    counts = [count for s, count in self.as_counts_by_alphabet() if s not in ignore]

    # count position-specific frequencies
    data = wl.LogoData.from_counts(alphabet, np.array(counts).T)

    # render the logo to bytes and open it as an image
    logo = opts.formatter(data, wl.LogoFormat(data, opts))
    return Image.open(io.BytesIO(logo))
def align(self, preserve_order: bool = True, *args, **kwargs):
    """Align the SeqLikes in this Series and return them as a new Series.

    The sequence type handed to the aligner is the `_type` of the first
    element.

    :param preserve_order: Whether or not to preserve the order of sequences.
    :param *args: Not used.
    :param **kwargs: Passed into the `seqlike.alignment_utils.align` function.
    :returns: A pandas Series of aligned sequences, carrying the index of
        the original Series.
    """
    seq_type = self._obj.iloc[0]._type
    aligned_records = align(
        self._obj,
        seq_type=seq_type,
        preserve_order=preserve_order,
        **kwargs,
    )
    aligned_seqlikes = [SeqLike(record, seq_type) for record in aligned_records]
    return pd.Series(aligned_seqlikes, index=self._obj.index)
def as_alignment(self, alphabet: Optional[str] = None) -> MultipleSeqAlignment:
    """Build a `Bio.Align.MultipleSeqAlignment` from the SeqLikes.

    An alignment needs equal-length rows, so when the lengths differ
    every SeqLike is passed through `pad_to` with the longest length
    first. This allows for plotting of SeqLikes of variable lengths.

    :param alphabet: The SeqLike alphabet to use; forwarded to each
        SeqLike's `to_seqrecord`.
    :returns: A `Bio.Align.MultipleSeqAlignment` object.
    """
    distinct_lengths = set(self._obj.apply(len).tolist())
    if len(distinct_lengths) == 1:
        rows = self._obj
    else:
        longest = max(distinct_lengths)
        rows = [seqlike.pad_to(longest) for seqlike in self._obj]
    records = [seqlike.to_seqrecord(alphabet=alphabet) for seqlike in rows]
    return MultipleSeqAlignment(records)
def as_counts(self, pad=True, dtype=float, encoder=None) -> np.ndarray:
    """Return a 2D numpy array of letter counts from the sequences or alignment.

    The one-hot encoding from `to_onehot` is summed over its first axis
    (the sequences), so each row of the result is a sequence position
    and each column is a letter.

    :param pad: Whether or not to pad sequence.
    :param dtype: numpy dtype
    :param encoder: The one-hot encoder to use. When None, one is built
        from this Series' alphabet.
    :returns: A NumPy array.
    """
    chosen_encoder = onehot_encoder_from_alphabet(self.alphabet) if encoder is None else encoder
    onehot = self.to_onehot(pad=pad, dtype=dtype, encoder=chosen_encoder)
    return onehot.sum(axis=0)
def as_counts_df(self, pad=True, dtype=float, encoder=None):
    """Return DataFrame of letter counts from the sequences (or sequence alignment).

    In the returned frame the rows are labelled with the letters of the
    alphabet and the columns are the sequence positions (alignment
    columns).

    :param pad: Whether or not to pad sequence.
    :param dtype: numpy dtype
    :param encoder: The one-hot encoder to use.
    :returns: A pandas DataFrame
    """
    counts = self.as_counts(pad=pad, dtype=dtype, encoder=encoder)
    # Build with letters as column labels, then flip so letters label rows.
    by_position = pd.DataFrame(counts, columns=[*self.alphabet])
    return by_position.transpose()
def as_counts_by_alphabet(self, pad=True, dtype=float, encoder=None):
    """
    Pair each alphabet letter with its counts along the sequence.

    The counts array from `as_counts` is transposed so that iterating it
    yields one array per letter, indexed by column of the sequence
    alignment.

    :param pad: Whether or not to pad sequence.
    :param dtype: numpy dtype
    :param encoder: The one-hot encoder to use.
    :returns: An iterator of (alphabet letter, counts) tuples
    """
    letters = self.alphabet
    counts_per_letter = self.as_counts(pad=pad, dtype=dtype, encoder=encoder).T
    return zip(letters, counts_per_letter)
@property
def alphabet(self) -> str:
    """Return the alphabet string shared by all of the sequences.

    Each SeqLike's alphabet is joined into a single string before the
    comparison.

    :returns: An alphabet string.
    :raises AssertionError: When the sequences do not all have the same
        alphabet.
    """
    joined = self._obj.apply(lambda seqlike: "".join(seqlike.alphabet))
    distinct = joined.unique()
    assert len(distinct) == 1
    return joined.iloc[0]
def _extend_ambiguous_counts(self) -> np.ndarray:
    """Re-assign letter counts according to the ambiguity table.

    For every alphabet letter that appears in the ambiguity table
    (`extended_protein_values` for AA, `ambiguous_nt_values` otherwise),
    its per-position counts are added in full to each letter it maps to.
    Letters that are absent from the table contribute nothing to the
    result.

    :returns: a numpy 2d array of letter counts by sequence position after expanding
        the ambiguous letters.
    """
    ambiguous_values = extended_protein_values if self._type == "AA" else ambiguous_nt_values

    # counts of letter identity by sequence position
    counts = self.as_counts()
    expanded = np.zeros(counts.shape)
    alphabet = self.alphabet

    for source_col, letter in enumerate(alphabet):
        if letter not in ambiguous_values:
            continue
        for target_letter in ambiguous_values[letter]:
            target_col = alphabet.index(target_letter)
            # counts matrix has shape (len(seq), len(alphabet))
            expanded[:, target_col] += counts[:, source_col]
    return expanded
def consensus(self, ignore_gap=True) -> SeqLike:
    """Return the consensus sequence as a SeqLike.

    At each position the letter with the highest count (after
    `_extend_ambiguous_counts`) is chosen; a position whose highest count
    is zero gets the gap letter.

    :param ignore_gap: Whether to ignore gaps or not. Defaults to True.
    :returns: The consensus sequence as a SeqLike.
    """
    alphabet = STANDARD_AA if self._type == "AA" else STANDARD_NT
    counts = self._extend_ambiguous_counts()

    if ignore_gap:
        # zero the gap letter counts so that gaps do not show up in consensus
        gap_column = alphabet.index(gap_letter)
        counts[:, gap_column] = 0

    letters = []
    for position_counts in counts:
        best = np.argmax(position_counts)
        # if and only if no consensus (zero non-gap counts) at this position, use gap letter
        letters.append(alphabet[best] if position_counts[best] > 0 else gap_letter)
    return SeqLike("".join(letters), self._type)
def degenerate(self) -> SeqLike:
    """Return the ambiguous sequence representation of the sequences as a SeqLike

    Following the rules adapted from
    D. R. Cavener: "Comparison of the consensus sequence flanking
    translational start sites in Drosophila and vertebrates."
    Nucleic Acids Research 15(4): 1353-1361. (1987).
    The same rules are used by TRANSFAC.

    :sa: http://biopython.org/DIST/docs/api/Bio.motifs.matrix-pysrc.html

    :returns: The degenerate SeqLike.
    """
    if self._type == "AA":
        alphabet = STANDARD_AA
        ambiguous_values = extended_protein_values
        generic_value = generic_protein_letter
    else:
        alphabet = STANDARD_NT
        ambiguous_values = ambiguous_nt_values
        generic_value = generic_nt_letter

    # ambiguous letter indexed by sorted representative unambiguous letters
    reverse_ambiguous_values = {v: k for k, v in ambiguous_values.items()}
    counts = self._extend_ambiguous_counts()

    letters = []
    for position_counts in counts:
        # Collect, in alphabet order, every non-gap letter seen at this
        # position. Compare with `!=`: `is not` tests object identity and
        # only works for strings by accident of interning.
        key = "".join(
            letter
            for j, letter in enumerate(alphabet)
            if position_counts[j] > 0 and letter != gap_letter
        )

        # if and only if no consensus (counts) at this position, use gap letter
        if not key:
            letters.append(gap_letter)
            continue

        # NOTE(review): presumably the ambiguity table spells T/U groups
        # as "TU", so a lone T or U inside a multi-letter key is expanded
        # to match - confirm against the table.
        if len(key) > 1 and "TU" not in key:
            if "T" in key:
                key = key.replace("T", "TU")
            elif "U" in key:
                key = key.replace("U", "TU")

        # Fall back to the generic letter when the combination has no
        # dedicated ambiguity code.
        letters.append(reverse_ambiguous_values.get(key, generic_value))
    return SeqLike("".join(letters), self._type)
def __getitem__(self, index):
    """Slice sequences by dataframe row and sequence column.

    :param index: A 2-tuple `(rows, columns)`. `rows` is a slice or int
        applied positionally to the Series; `columns` is a slice, int or
        list used to index each selected sequence.
    :returns: The result of indexing each selected sequence with
        `columns`.
    :raises AssertionError: When `index` is not a 2-tuple or its members
        have unsupported types.
    """
    # Require exactly two members: a bare `len(index)` only rejected the
    # empty tuple, so a 1-tuple failed later with IndexError and extra
    # members were silently ignored.
    assert isinstance(index, tuple) and len(index) == 2, "Index is row and column"
    rows, columns = index
    assert isinstance(rows, (slice, int)) and isinstance(columns, (slice, int, list))
    return self._obj.iloc[rows].apply(lambda seq: seq[columns])
def max_length(self):
    """Return the length of the longest sequence in the Series.

    :returns: The maximum of `len()` over all elements, as a Python int.
    """
    lengths = self._obj.apply(len)
    longest = lengths.max()
    return int(longest)
def get_seq_by_id(self, seq_id):
    """Get the single sequence record whose `id` equals `seq_id`.

    Rows are filtered by id and then de-duplicated, so equal SeqLikes
    that appear in several rows count as one match.

    :param seq_id: The ID to search for.
    :returns: The one SeqLike object that has that particular ID.
    :raises AssertionError: When no SeqLike, or more than one distinct
        SeqLike, has that ID.
    """
    has_id = self._obj.apply(lambda x: x.id == seq_id)
    matches = self._obj[has_id].drop_duplicates()
    # Same exception type as before, but now says what went wrong.
    assert len(matches) == 1, (
        f"Expected exactly one sequence with id {seq_id!r}, found {len(matches)}"
    )
    return matches.iloc[0]
def slice_to_ref(self, ref_id, list_of_seqnums=None):
    """
    Slice alignment sequences by columns corresponding to seqnums.

    :param ref_id: The `id` of the sequence to use
        as a reference for position numbers.
    :param list_of_seqnums: A list of integers corresponding to the positions.
        This is optional; if not provided we use every non-None entry of
        the `seqnums` field in the `letter_annotations` of the reference.
    :returns: A pandas Series with the sliced SeqLikes.
    """
    reference = self.get_seq_by_id(ref_id)

    if list_of_seqnums is None:
        annotated = reference._seqrecord.letter_annotations["seqnums"]
        list_of_seqnums = [number for number in annotated if number is not None]

    # translate the seqnums into column indices on the reference
    column_indices = reference.seq_num_to_idx(list_of_seqnums)

    # every row, only the selected columns
    all_rows = slice(None, None, None)
    return self.__getitem__((all_rows, column_indices))
def nt(self, codon_map=None) -> pd.Series:
    """
    Return a Pandas Series of the NT form of the column of seqlikes.

    :param codon_map: Forwarded to each SeqLike's `nt` method.
    :returns: A pandas Series
    """

    def to_nt(seqlike):
        return seqlike.nt(codon_map=codon_map)

    return self._obj.apply(to_nt)
def aa(self) -> pd.Series:
    """
    Return a Pandas Series of the AA form of the column of seqlikes.

    Each element's own `aa()` method is called.

    :returns: A pandas Series
    """

    def to_aa(seqlike):
        return seqlike.aa()

    return self._obj.apply(to_aa)
def to_onehot(self, pad=True, dtype=float, encoder=None) -> np.ndarray:
    """Return a 3d Numpy array of the specified column in onehot encoding.

    The per-sequence encodings are stacked along a new leading axis, so
    the dimensions will be num_seqs x length x num_letters.
    We pad if needed, because the numpy arrays must be the same size.

    :param pad: Whether or not to pad each sequence to the longest
        length before encoding. Defaults to True.
    :param dtype: The dtype of the resulting numpy array.
    :param encoder: The sklearn-compatible encoder object.
        Defaults to None.
    :returns: A one-hot-encoded array.
    """
    if not pad:
        encoded = self._obj.apply(lambda seq: seq.to_onehot(dtype, encoder))
        return np.stack(encoded.values, axis=0)

    target_len = self.max_length()
    encoded = self._obj.apply(lambda seq: seq.pad_to(target_len).to_onehot(dtype, encoder))
    return np.stack(encoded.values, axis=0)
def to_index(self, pad: bool = True, dtype: type = int, encoder=None) -> np.ndarray:
    """Return a 2d Numpy array of the specified column in index encoding.

    The per-sequence encodings are stacked along a new leading axis, so
    the dimensions will be num_seqs x length.
    We pad if needed, because the numpy arrays must be the same size.

    :param pad: Whether or not to pad each sequence to the longest
        length before encoding. Defaults to True.
    :param dtype: The dtype of the resulting numpy array.
    :param encoder: The sklearn-compatible encoder object.
        Defaults to None.
    :returns: An index-encoded array.
    """
    if not pad:
        encoded = self._obj.apply(lambda seq: seq.to_index(dtype, encoder))
        return np.stack(encoded.values, axis=0)

    target_len = self.max_length()
    encoded = self._obj.apply(lambda seq: seq.pad_to(target_len).to_index(dtype, encoder))
    return np.stack(encoded.values, axis=0)
def back_translate(self, codon_map=None):
    """Back-translate the collection of SeqLikes.

    :param codon_map: A SeqLike codon map to use; forwarded to each
        SeqLike's `back_translate`. Use the specified codon_map if given,
        or the codon_map in each SeqLike if not.
    :returns: a Pandas Series holding the back-translated form of each
        element.
    """

    def translate_back(seqlike):
        return seqlike.back_translate(codon_map=codon_map)

    return self._obj.apply(translate_back)
def ungap(self):
    """Return ungapped seqlikes.

    Each element's own `ungap()` method is called.
    Note that this may disrupt any NT/AA correspondence.

    :returns: A Pandas series of the specified column with all gaps removed.
    """

    def strip_gaps(seqlike):
        return seqlike.ungap()

    return self._obj.apply(strip_gaps)
@property
def _type(self):
    """Return the upper-cased sequence type of the Series.

    Only the first element is inspected.

    :returns: The `_type` attribute of the _first_ seqlike object,
        upper-cased.
    """
    first = self._obj.iloc[0]
    return first._type.upper()
def __repr__(self):
    """Return the repr of the wrapped Series."""
    series_repr = self._obj.__repr__()
    return "{}".format(series_repr)
alphabet: str
property
readonly
Return the alphabet string shared by all of the sequences.
:returns: An alphabet string.
__getitem__(self, index)
special
Slice sequences by dataframe row and sequence column.
:param index: A tuple specifying the dataframe row and sequence column.
:returns: The letter of a given sequence.
Source code in seqlike/SeqLikeAccessor.py
def __getitem__(self, index):
    """Slice sequences by dataframe row and sequence column.

    :param index: A 2-tuple `(rows, columns)`. `rows` is a slice or int
        applied positionally to the Series; `columns` is a slice, int or
        list used to index each selected sequence.
    :returns: The result of indexing each selected sequence with
        `columns`.
    :raises AssertionError: When `index` is not a 2-tuple or its members
        have unsupported types.
    """
    # Require exactly two members: a bare `len(index)` only rejected the
    # empty tuple, so a 1-tuple failed later with IndexError and extra
    # members were silently ignored.
    assert isinstance(index, tuple) and len(index) == 2, "Index is row and column"
    rows, columns = index
    assert isinstance(rows, (slice, int)) and isinstance(columns, (slice, int, list))
    return self._obj.iloc[rows].apply(lambda seq: seq[columns])
aa(self)
Return a Pandas Series of the AA form of the column of seqlikes.
:returns: A pandas Series
Source code in seqlike/SeqLikeAccessor.py
def aa(self) -> pd.Series:
    """
    Return a Pandas Series of the AA form of the column of seqlikes.

    Each element's own `aa()` method is called.

    :returns: A pandas Series
    """

    def to_aa(seqlike):
        return seqlike.aa()

    return self._obj.apply(to_aa)
align(self, preserve_order=True, *args, **kwargs)
Returns a Series of aligned SeqLikes from the specified column (by default, 'seqs').
:param preserve_order: Whether or not to preserve the order of sequences.
:param *args: Not used.
:param **kwargs: Passed into the seqlike.alignment_utils.align function.
:returns: A pandas Series of aligned sequences.
Source code in seqlike/SeqLikeAccessor.py
def align(self, preserve_order: bool = True, *args, **kwargs):
    """Align the SeqLikes in this Series and return them as a new Series.

    The sequence type handed to the aligner is the `_type` of the first
    element.

    :param preserve_order: Whether or not to preserve the order of sequences.
    :param *args: Not used.
    :param **kwargs: Passed into the `seqlike.alignment_utils.align` function.
    :returns: A pandas Series of aligned sequences, carrying the index of
        the original Series.
    """
    seq_type = self._obj.iloc[0]._type
    aligned_records = align(
        self._obj,
        seq_type=seq_type,
        preserve_order=preserve_order,
        **kwargs,
    )
    aligned_seqlikes = [SeqLike(record, seq_type) for record in aligned_records]
    return pd.Series(aligned_seqlikes, index=self._obj.index)
as_alignment(self, alphabet=None)
Return a Bio.Align.MultipleSeqAlignment
of the specified columns'
SeqRecords.
Because MSAs must be the same length, we pad the ends if needed. This allows for plotting of SeqLikes of variable lengths.
:param alphabet: The SeqLike alphabet to use.
:returns: A Bio.Align.MultipleSeqAlignment
object.
Source code in seqlike/SeqLikeAccessor.py
def as_alignment(self, alphabet: Optional[str] = None) -> MultipleSeqAlignment:
    """Build a `Bio.Align.MultipleSeqAlignment` from the SeqLikes.

    An alignment needs equal-length rows, so when the lengths differ
    every SeqLike is passed through `pad_to` with the longest length
    first. This allows for plotting of SeqLikes of variable lengths.

    :param alphabet: The SeqLike alphabet to use; forwarded to each
        SeqLike's `to_seqrecord`.
    :returns: A `Bio.Align.MultipleSeqAlignment` object.
    """
    distinct_lengths = set(self._obj.apply(len).tolist())
    if len(distinct_lengths) == 1:
        rows = self._obj
    else:
        longest = max(distinct_lengths)
        rows = [seqlike.pad_to(longest) for seqlike in self._obj]
    records = [seqlike.to_seqrecord(alphabet=alphabet) for seqlike in rows]
    return MultipleSeqAlignment(records)
as_counts(self, pad=True, dtype=<class 'float'>, encoder=None)
Return a 2D numpy array of letter counts from the sequences or alignment.
Here the sequence position indices (alignment columns) are the columns of the array, and the rows correspond to the letters.
:param pad: Whether or not to pad sequence. :param dtype: numpy dtype :param encoder: The one-hot encoder to use. :returns: A NumPy array.
Source code in seqlike/SeqLikeAccessor.py
def as_counts(self, pad=True, dtype=float, encoder=None) -> np.ndarray:
    """Return a 2D numpy array of letter counts from the sequences or alignment.

    The one-hot encoding from `to_onehot` is summed over its first axis
    (the sequences), so each row of the result is a sequence position
    and each column is a letter.

    :param pad: Whether or not to pad sequence.
    :param dtype: numpy dtype
    :param encoder: The one-hot encoder to use. When None, one is built
        from this Series' alphabet.
    :returns: A NumPy array.
    """
    chosen_encoder = onehot_encoder_from_alphabet(self.alphabet) if encoder is None else encoder
    onehot = self.to_onehot(pad=pad, dtype=dtype, encoder=chosen_encoder)
    return onehot.sum(axis=0)
as_counts_by_alphabet(self, pad=True, dtype=<class 'float'>, encoder=None)
Return generator of (alphabet letter, letter counts) tuples.
This is done for each letter in alphabet, where the letter counts are indexed by column of the sequence alignment.
:param pad: Whether or not to pad sequence. :param dtype: numpy dtype :param encoder: The one-hot encoder to use. :returns: A generator of (alphabet letter, count) tuples
Source code in seqlike/SeqLikeAccessor.py
def as_counts_by_alphabet(self, pad=True, dtype=float, encoder=None):
    """
    Pair each alphabet letter with its counts along the sequence.

    The counts array from `as_counts` is transposed so that iterating it
    yields one array per letter, indexed by column of the sequence
    alignment.

    :param pad: Whether or not to pad sequence.
    :param dtype: numpy dtype
    :param encoder: The one-hot encoder to use.
    :returns: An iterator of (alphabet letter, counts) tuples
    """
    letters = self.alphabet
    counts_per_letter = self.as_counts(pad=pad, dtype=dtype, encoder=encoder).T
    return zip(letters, counts_per_letter)
as_counts_df(self, pad=True, dtype=<class 'float'>, encoder=None)
Return DataFrame of letter counts from the sequences (or sequence alignment).
The sequence position indices (alignment columns) are the columns of the array, and the rows correspond to the letters in the alphabet.
:param pad: Whether or not to pad sequence. :param dtype: numpy dtype :param encoder: The one-hot encoder to use. :returns: A pandas DataFrame
Source code in seqlike/SeqLikeAccessor.py
def as_counts_df(self, pad=True, dtype=float, encoder=None):
    """Return DataFrame of letter counts from the sequences (or sequence alignment).

    In the returned frame the rows are labelled with the letters of the
    alphabet and the columns are the sequence positions (alignment
    columns).

    :param pad: Whether or not to pad sequence.
    :param dtype: numpy dtype
    :param encoder: The one-hot encoder to use.
    :returns: A pandas DataFrame
    """
    counts = self.as_counts(pad=pad, dtype=dtype, encoder=encoder)
    # Build with letters as column labels, then flip so letters label rows.
    by_position = pd.DataFrame(counts, columns=[*self.alphabet])
    return by_position.transpose()
back_translate(self, codon_map=None)
Back-translate the collection of SeqLikes.
:param codon_map: A SeqLike codon map to use.
:returns: A Pandas Series of the specified column with the AAs back-translated. The specified codon_map is used if given; otherwise the codon_map in each SeqLike is used.
Source code in seqlike/SeqLikeAccessor.py
def back_translate(self, codon_map=None):
    """Back-translate the collection of SeqLikes.

    :param codon_map: A SeqLike codon map to use; forwarded to each
        SeqLike's `back_translate`. Use the specified codon_map if given,
        or the codon_map in each SeqLike if not.
    :returns: a Pandas Series holding the back-translated form of each
        element.
    """

    def translate_back(seqlike):
        return seqlike.back_translate(codon_map=codon_map)

    return self._obj.apply(translate_back)
consensus(self, ignore_gap=True)
Return the consensus sequence as a SeqLike.
Ambiguous letter counts are distributed among represented unambiguous letters.
:param ignore_gap: Whether to ignore gaps or not. Defaults to True. :returns: The consensus sequence as a SeqLike.
Source code in seqlike/SeqLikeAccessor.py
def consensus(self, ignore_gap=True) -> SeqLike:
    """Return the consensus sequence as a SeqLike.

    At each position the letter with the highest count (after
    `_extend_ambiguous_counts`) is chosen; a position whose highest count
    is zero gets the gap letter.

    :param ignore_gap: Whether to ignore gaps or not. Defaults to True.
    :returns: The consensus sequence as a SeqLike.
    """
    alphabet = STANDARD_AA if self._type == "AA" else STANDARD_NT
    counts = self._extend_ambiguous_counts()

    if ignore_gap:
        # zero the gap letter counts so that gaps do not show up in consensus
        gap_column = alphabet.index(gap_letter)
        counts[:, gap_column] = 0

    letters = []
    for position_counts in counts:
        best = np.argmax(position_counts)
        # if and only if no consensus (zero non-gap counts) at this position, use gap letter
        letters.append(alphabet[best] if position_counts[best] > 0 else gap_letter)
    return SeqLike("".join(letters), self._type)
degenerate(self)
Return the ambiguous sequence representation of the sequences as a SeqLike
Following the rules adapted from D. R. Cavener: "Comparison of the consensus sequence flanking translational start sites in Drosophila and vertebrates." Nucleic Acids Research 15(4): 1353-1361. (1987). The same rules are used by TRANSFAC.
:sa: http://biopython.org/DIST/docs/api/Bio.motifs.matrix-pysrc.html
:returns: The degenerate SeqLike.
Source code in seqlike/SeqLikeAccessor.py
def degenerate(self) -> SeqLike:
    """Return the ambiguous sequence representation of the sequences as a SeqLike

    Following the rules adapted from
    D. R. Cavener: "Comparison of the consensus sequence flanking
    translational start sites in Drosophila and vertebrates."
    Nucleic Acids Research 15(4): 1353-1361. (1987).
    The same rules are used by TRANSFAC.

    :sa: http://biopython.org/DIST/docs/api/Bio.motifs.matrix-pysrc.html

    :returns: The degenerate SeqLike.
    """
    if self._type == "AA":
        alphabet = STANDARD_AA
        ambiguous_values = extended_protein_values
        generic_value = generic_protein_letter
    else:
        alphabet = STANDARD_NT
        ambiguous_values = ambiguous_nt_values
        generic_value = generic_nt_letter

    # ambiguous letter indexed by sorted representative unambiguous letters
    reverse_ambiguous_values = {v: k for k, v in ambiguous_values.items()}
    counts = self._extend_ambiguous_counts()

    letters = []
    for position_counts in counts:
        # Collect, in alphabet order, every non-gap letter seen at this
        # position. Compare with `!=`: `is not` tests object identity and
        # only works for strings by accident of interning.
        key = "".join(
            letter
            for j, letter in enumerate(alphabet)
            if position_counts[j] > 0 and letter != gap_letter
        )

        # if and only if no consensus (counts) at this position, use gap letter
        if not key:
            letters.append(gap_letter)
            continue

        # NOTE(review): presumably the ambiguity table spells T/U groups
        # as "TU", so a lone T or U inside a multi-letter key is expanded
        # to match - confirm against the table.
        if len(key) > 1 and "TU" not in key:
            if "T" in key:
                key = key.replace("T", "TU")
            elif "U" in key:
                key = key.replace("U", "TU")

        # Fall back to the generic letter when the combination has no
        # dedicated ambiguity code.
        letters.append(reverse_ambiguous_values.get(key, generic_value))
    return SeqLike("".join(letters), self._type)
get_seq_by_id(self, seq_id)
Get a sequence record by id.
:param seq_id: The ID to search for. :returns: The first SeqLike object that has that particular ID.
Source code in seqlike/SeqLikeAccessor.py
def get_seq_by_id(self, seq_id):
    """Get the single sequence record whose `id` equals `seq_id`.

    Rows are filtered by id and then de-duplicated, so equal SeqLikes
    that appear in several rows count as one match.

    :param seq_id: The ID to search for.
    :returns: The one SeqLike object that has that particular ID.
    :raises AssertionError: When no SeqLike, or more than one distinct
        SeqLike, has that ID.
    """
    has_id = self._obj.apply(lambda x: x.id == seq_id)
    matches = self._obj[has_id].drop_duplicates()
    # Same exception type as before, but now says what went wrong.
    assert len(matches) == 1, (
        f"Expected exactly one sequence with id {seq_id!r}, found {len(matches)}"
    )
    return matches.iloc[0]
nt(self, codon_map=None)
Return a Pandas Series of the NT form of the column of seqlikes.
:returns: A pandas Series
Source code in seqlike/SeqLikeAccessor.py
def nt(self, codon_map=None) -> pd.Series:
    """
    Return a Pandas Series of the NT form of the column of seqlikes.

    :param codon_map: Forwarded to each SeqLike's `nt` method.
    :returns: A pandas Series
    """

    def to_nt(seqlike):
        return seqlike.nt(codon_map=codon_map)

    return self._obj.apply(to_nt)
plot(self, use_bokeh=True, colorscheme=None, x_scale=1, y_scale=1, *args, **kwargs)
Plot the SeqLikes as a multiple sequence alignment.
All *args and **kwargs parameters mirror .draw_utils.draw_alignment and .draw_utils.view_alignment. We use .as_alignment() for convenience.
:param use_bokeh: bool; if True (default), use Bokeh backend if available, otherwise use draw_alignment
:param colorscheme: ColorScheme, WebLogo based mapping of symbol to color
:param x_scale: float, x scaling factor used with draw_alignment backend
:param y_scale: float, y scaling factor used with draw_alignment backend
:param *args: Passed into seqlike.alignment_utils.view_alignment.
:param **kwargs: Passed into seqlike.alignment_utils.view_alignment.
:returns: a PIL Image object or Bokeh object.
Source code in seqlike/SeqLikeAccessor.py
def plot(self, use_bokeh=True, colorscheme=None, x_scale=1, y_scale=1, *args, **kwargs):
    """Plot the SeqLikes as a multiple sequence alignment.

    All *args and **kwargs parameters mirror
    .draw_utils.draw_alignment and .draw_utils.view_alignment.
    We use .as_alignment() for convenience.

    :param use_bokeh: bool; if True (default), try `view_alignment` first
        and fall back to `draw_alignment` if it raises.
    :param colorscheme: ColorScheme, WebLogo based mapping of symbol to color.
        When None, `nt_simple` is used for NT and `aa_chemistry_simple` for AA.
    :param x_scale: float, x scaling factor used with draw_alignment backend
    :param y_scale: float, y scaling factor used with draw_alignment backend
    :param *args: Passed into `view_alignment`, and into `draw_alignment`
        on the fallback path.
    :param **kwargs: Passed into `view_alignment`, and into `draw_alignment`
        on the fallback path.
    :returns: whatever `view_alignment` returns, or the resized image
        produced by `draw_alignment`.
    """
    if colorscheme is None:
        seq_type = self._type
        if seq_type == "NT":
            colorscheme = nt_simple
        elif seq_type == "AA":
            colorscheme = aa_chemistry_simple

    # Build the alignment once; both backends consume the same object.
    alignment = self.as_alignment()

    if use_bokeh:
        try:
            return view_alignment(alignment, *args, colorscheme=colorscheme, **kwargs)
        except Exception:
            # Deliberate best-effort fallback to the static backend.
            # `Exception` (not a bare except) so KeyboardInterrupt and
            # SystemExit still propagate.
            pass

    image = draw_alignment(alignment, *args, colorscheme=colorscheme, **kwargs)
    width, height = image.size[0], image.size[1]
    return image.resize(size=(int(width * x_scale), int(height * y_scale)))
slice_to_ref(self, ref_id, list_of_seqnums=None)
Slice alignment sequences by columns corresponding to seqnums.
:param ref_id: The id
of the sequence to use
as a reference for position numbers.
:param list_of_seqnums: A list of integers corresponding to the positions.
This is optional; if not provided we defer to seqnums
field
in the letter_annotations
of the SeqLike object.
:returns: A pandas Series with the sliced SeqLikes.
Source code in seqlike/SeqLikeAccessor.py
def slice_to_ref(self, ref_id, list_of_seqnums=None):
    """
    Slice alignment sequences by columns corresponding to seqnums.

    :param ref_id: The `id` of the sequence to use
        as a reference for position numbers.
    :param list_of_seqnums: A list of integers corresponding to the positions.
        This is optional; if not provided we use every non-None entry of
        the `seqnums` field in the `letter_annotations` of the reference.
    :returns: A pandas Series with the sliced SeqLikes.
    """
    reference = self.get_seq_by_id(ref_id)

    if list_of_seqnums is None:
        annotated = reference._seqrecord.letter_annotations["seqnums"]
        list_of_seqnums = [number for number in annotated if number is not None]

    # translate the seqnums into column indices on the reference
    column_indices = reference.seq_num_to_idx(list_of_seqnums)

    # every row, only the selected columns
    all_rows = slice(None, None, None)
    return self.__getitem__((all_rows, column_indices))
to_index(self, pad=True, dtype=<class 'int'>, encoder=None)
Return a 2d Numpy array of the specified column in index encoding.
The dimensions will be num_seqs x length, and the values will range from 0 to num_bases-1 (4 for NT, 27 for AAs).
We pad if needed, because the numpy arrays must be the same size.
:param pad: Whether or not to pad characters. Defaults to True. :param dtype: The dtype of the resulting numpy array. :param encoder: The sklearn-compatible encoder object. Defaults to None. :returns: An index-encoded array.
Source code in seqlike/SeqLikeAccessor.py
def to_index(self, pad: bool = True, dtype: type = int, encoder=None) -> np.ndarray:
    """Index-encode every SeqLike in the series and stack the results.

    The result is a 2d array of shape num_seqs x length, with values
    ranging from 0 to num_bases-1 (4 for NT, 27 for AAs).

    Stacking requires equally sized rows, so by default each sequence is
    first padded to the length of the longest one.

    :param pad: Whether or not to pad characters. Defaults to True.
    :param dtype: The dtype of the resulting numpy array.
    :param encoder: The sklearn-compatible encoder object.
        Defaults to None.
    :returns: An index-encoded array.
    """
    if pad:
        # Compute the target length once; every row is padded up to it.
        target_length = self.max_length()

        def encode(seqlike):
            return seqlike.pad_to(target_length).to_index(dtype, encoder)

    else:

        def encode(seqlike):
            return seqlike.to_index(dtype, encoder)

    encoded_rows = self._obj.apply(encode).values
    return np.stack(encoded_rows, axis=0)
to_onehot(self, pad=True, dtype=<class 'float'>, encoder=None)
Return a 3d Numpy array of the specified column in onehot encoding.
The dimensions will be num_seqs x length x num_bases (5 for NT, 28 for AA)
We pad if needed, because the numpy arrays must be the same size.
:param pad: Whether or not to pad characters. Defaults to True.
:param dtype: The dtype of the resulting numpy array.
:param encoder: The sklearn-compatible encoder object. Defaults to None.
:returns: A one-hot-encoded array.
Source code in seqlike/SeqLikeAccessor.py
def to_onehot(self, pad=True, dtype=float, encoder=None) -> np.ndarray:
    """One-hot-encode every SeqLike in the series and stack the results.

    The result is a 3d array of shape num_seqs x length x num_bases
    (5 for NT, 28 for AA).

    Stacking requires equally sized rows, so by default each sequence is
    first padded to the length of the longest one.

    :param pad: Whether or not to pad characters. Defaults to True.
    :param dtype: The dtype of the resulting numpy array.
    :param encoder: The sklearn-compatible encoder object.
        Defaults to None.
    :returns: A one-hot-encoded array.
    """
    if pad:
        # Compute the target length once; every row is padded up to it.
        target_length = self.max_length()

        def encode(seqlike):
            return seqlike.pad_to(target_length).to_onehot(dtype, encoder)

    else:

        def encode(seqlike):
            return seqlike.to_onehot(dtype, encoder)

    encoded_rows = self._obj.apply(encode).values
    return np.stack(encoded_rows, axis=0)
ungap(self)
Return ungapped seqlikes.
Note that removing gaps may disrupt any NT/AA correspondence.
:returns: A Pandas series of the specified column with all gaps removed.
Source code in seqlike/SeqLikeAccessor.py
def ungap(self):
    """Return the series with `ungap()` applied to every SeqLike.

    Removing gaps may disrupt any NT/AA correspondence.

    :returns: A Pandas series of the specified column with all gaps removed.
    """

    def strip_gaps(seqlike):
        return seqlike.ungap()

    return self._obj.apply(strip_gaps)
weblogo(self, seqnum_labels=None, ref_id=None, cols=50, color_scheme=aa_chemistry_simple, logo_font='ArialMT', logo_format='png', resolution=200, **kwargs)
Draw weblogo from sequence alignment with optional labeling for consensus mutations.
:param seqnum_labels: label the weblogo letters with these seqnums
:param ref_id: derive the weblogo letter labels using seqnums from this reference record
:param cols: weblogo column width (number of letters)
:param color_scheme: weblogo color scheme
:param logo_font: weblogo font
:param logo_format: weblogo image format ('png', 'eps', 'jpg', etc)
:param resolution: weblogo resolution in DPI
:param **kwargs: additional weblogo arguments
:returns: PIL Image object
Source code in seqlike/SeqLikeAccessor.py
def weblogo(
    self,
    seqnum_labels=None,
    ref_id=None,
    cols=50,
    color_scheme=aa_chemistry_simple,
    logo_font="ArialMT",
    logo_format="png",
    resolution=200,
    **kwargs,
):
    """Draw weblogo from sequence alignment with optional labeling for consensus mutations

    Column labels are chosen as follows:

    * if `seqnum_labels` is given, it is used as-is (its length must equal
      the maximum sequence length);
    * otherwise, if `ref_id` is given, labels come from the reference's
      `seqnums` letter annotation, or from 1-based column numbers when the
      reference has no such annotation; in both cases, columns where the
      consensus differs from the reference are labeled `[<ref letter><label>]`;
    * otherwise, columns are numbered from 1.

    Gap and stop characters are excluded from the logo.

    :param seqnum_labels: label the weblogo letters with these seqnums
    :param ref_id: derive the weblogo letter labels using seqnums from this reference record
    :param cols: weblogo column width (number of letters)
    :param color_scheme: weblogo color scheme
    :param logo_font: weblogo font
    :param logo_format: weblogo image format ('png', 'eps', 'jpg', etc)
    :param resolution: weblogo resolution in DPI
    :param **kwargs: additional weblogo arguments
    :returns: PIL Image object
    :raises AssertionError: if `seqnum_labels` is given and its length does
        not match the maximum sequence length.
    """
    # Imported lazily so that weblogo is only needed when a logo is drawn.
    import weblogo as wl

    def highlight_consensus(labels, consensus, ref):
        # Prefix a label with the reference letter wherever the consensus
        # letter departs from the reference letter.
        return [
            f"[{ref_letter}{label}]" if consensus_letter != ref_letter else label
            for label, consensus_letter, ref_letter in zip(labels, consensus, ref)
        ]

    # make sequence labels
    consensus = self.consensus()
    if seqnum_labels:
        assert len(seqnum_labels) == self.max_length(), "Number of labels does not match sequence length"
    elif ref_id:
        refseq = self.get_seq_by_id(ref_id)
        if "seqnums" in refseq.letter_annotations:
            seqnum_labels = refseq.letter_annotations["seqnums"]
        else:
            # Without a seqnums annotation there is nothing to label with;
            # fall back to plain column numbers rather than passing None on
            # to highlight_consensus (which would fail inside zip()).
            seqnum_labels = range(1, len(consensus) + 1)
        # if reference sequence provided, highlight the consensus positions
        seqnum_labels = highlight_consensus(seqnum_labels, consensus, refseq)
    else:
        seqnum_labels = range(1, len(consensus) + 1)

    # set weblogo options
    opts = wl.LogoOptions(
        formatter=wl.formatters[logo_format],
        stacks_per_line=cols,
        color_scheme=color_scheme(),
        logo_font=logo_font,
        resolution=resolution,
        **kwargs,
    )
    opts.rotate_numbers = True
    opts.annotate = seqnum_labels

    # remove gap and stop characters so that they are not included in weblogo
    ignore = [gap_letter, stop_letter]
    alphabet = wl.seq.Alphabet("".join(s for s in self.alphabet if s not in ignore))
    counts = [count for s, count in self.as_counts_by_alphabet() if s not in ignore]

    # count position-specific frequencies; transpose so rows are positions
    data = wl.LogoData.from_counts(alphabet, np.array(counts).T)

    # render with the selected formatter and load the bytes as an image
    logo = opts.formatter(data, wl.LogoFormat(data, opts))
    return Image.open(io.BytesIO(logo))
write(self, *args, **kwargs)
Simple wrapper on `SeqIO.write`.
:param *args: Passed into `SeqIO.write`.
:param **kwargs: Passed on to `SeqIO.write`.
Source code in seqlike/SeqLikeAccessor.py
def write(self, *args, **kwargs):
    """Simple wrapper on `SeqIO.write`.

    Every element of the series is converted with `to_seqrecord()` and the
    resulting list is passed to `SeqIO.write` as its first argument.

    :param *args: Passed into `SeqIO.write` after the list of records.
    :param **kwargs: Passed on to `SeqIO.write`.
    :returns: Whatever `SeqIO.write` returns (for Biopython's `SeqIO`, the
        number of records written).
    """
    records = [seq.to_seqrecord() for seq in self._obj]
    # Propagate the wrapped call's result instead of discarding it.
    return SeqIO.write(records, *args, **kwargs)