Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions src/biotite/sequence/io/clustal/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# This source code is part of the Biotite package and is distributed
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
# information.

"""
This subpackage is used for reading and writing sequence alignment objects
using the ClustalW format (``.aln``).

This package contains the :class:`ClustalFile`, which provides a
dictionary-like interface to ClustalW files, where sequence names are
keys and gapped sequence strings are the corresponding values.

Furthermore, the package contains convenience functions for
getting/setting directly :class:`Alignment` objects, rather than strings.
"""

__name__ = "biotite.sequence.io.clustal"
__author__ = "Biotite contributors"

from .convert import *
from .file import *
123 changes: 123 additions & 0 deletions src/biotite/sequence/io/clustal/convert.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
# This source code is part of the Biotite package and is distributed
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
# information.

__name__ = "biotite.sequence.io.clustal"
__author__ = "Biotite contributors"

import functools
import warnings
from collections import OrderedDict
from biotite.sequence.align.alignment import Alignment
from biotite.sequence.alphabet import AlphabetError, LetterAlphabet
from biotite.sequence.seqtypes import NucleotideSequence, ProteinSequence

__all__ = ["get_alignment", "set_alignment"]


def get_alignment(clustal_file, seq_type=None):
"""
Get an alignment from a :class:`ClustalFile` instance.

Parameters
----------
clustal_file : ClustalFile
The :class:`ClustalFile` to be accessed.
seq_type : type[Sequence], optional
The :class:`Sequence` subclass contained in the file.
If not set, the type is automatically inferred as
:class:`ProteinSequence` or :class:`NucleotideSequence`.
For large sequence data it is recommended to set this parameter.

Returns
-------
alignment : Alignment
The alignment from the :class:`ClustalFile`.
"""
seq_strings = list(clustal_file.values())
if len(seq_strings) < 2:
raise ValueError(
f"ClustalW file contains {len(seq_strings)} sequence(s), "
f"but an alignment requires at least 2"
)
return Alignment.from_strings(
seq_strings,
functools.partial(_convert_to_sequence, seq_type=seq_type),
)


def set_alignment(clustal_file, alignment, seq_names, line_length=60):
"""
Fill a :class:`ClustalFile` with gapped sequence strings from an
alignment.

Parameters
----------
clustal_file : ClustalFile
The :class:`ClustalFile` to be accessed.
alignment : Alignment
The alignment to be set.
seq_names : iterable object of str
The names for the sequences in the alignment.
Must have the same length as the sequence count in `alignment`.
line_length : int, optional
The number of sequence characters per line in each block.
Default is 60.
"""
if line_length < 1:
raise ValueError(
f"'line_length' must be at least 1, got {line_length}"
)
gapped_seq_strings = alignment.get_gapped_sequences()
if len(gapped_seq_strings) != len(seq_names):
raise ValueError(
f"Alignment has {len(gapped_seq_strings)} sequences, "
f"but {len(seq_names)} names were given"
)
# Clear any existing entries before writing new data
clustal_file._entries.clear()
for name, seq_str in zip(seq_names, gapped_seq_strings):
clustal_file._entries[name] = seq_str
# Rebuild the text lines with the specified line length
clustal_file._rebuild_lines(line_length=line_length)


def _convert_to_sequence(seq_str, seq_type=None):
if seq_type is not None:
if seq_type == NucleotideSequence:
return _convert_to_nucleotide(seq_str)
elif seq_type == ProteinSequence:
if "U" in seq_str:
warnings.warn(
"ProteinSequence objects do not support selenocysteine "
"(U), occurrences were substituted by cysteine (C)"
)
return _convert_to_protein(seq_str)
else:
return seq_type(seq_str)

try:
return _convert_to_nucleotide(seq_str)
except AlphabetError:
pass
try:
prot_seq = _convert_to_protein(seq_str)
if "U" in seq_str:
warnings.warn(
"ProteinSequence objects do not support selenocysteine (U), "
"occurrences were substituted by cysteine (C)"
)
return prot_seq
except AlphabetError:
raise ValueError(
"ClustalW data cannot be converted either to "
"'NucleotideSequence' nor to 'ProteinSequence'"
)


def _convert_to_protein(seq_str):
return ProteinSequence(seq_str.upper().replace("U", "C").replace("O", "K"))


def _convert_to_nucleotide(seq_str):
return NucleotideSequence(seq_str.upper().replace("U", "T").replace("X", "N"))
162 changes: 162 additions & 0 deletions src/biotite/sequence/io/clustal/file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
# This source code is part of the Biotite package and is distributed
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
# information.

__name__ = "biotite.sequence.io.clustal"
__author__ = "Biotite contributors"
__all__ = ["ClustalFile"]

from collections import OrderedDict
from collections.abc import MutableMapping
from biotite.file import InvalidFileError, TextFile


_CONSENSUS_CHARS = frozenset(" *:.")


class ClustalFile(TextFile, MutableMapping):
"""
This class represents a file in ClustalW format (``.aln``).

A ClustalW file starts with a header line beginning with ``CLUSTAL``,
followed by blocks of aligned sequences separated by blank lines.
Each sequence line contains a sequence name followed by a segment of
the gapped sequence.
An optional consensus line, containing only ``*``, ``:``, ``.`` and
space characters, may follow each block.

This class is used in a dictionary-like manner, implementing the
:class:`MutableMapping` interface:
Sequence names are used as keys,
and strings containing the full gapped sequences are the
corresponding values.

Examples
--------

>>> import os.path
>>> file = ClustalFile()
>>> file["seq1"] = "ADTRCGTARDCGTR-DRTCGRAGD"
>>> file["seq2"] = "ADTRCGT---CGTRADRTCGRAGD"
>>> print(file["seq1"])
ADTRCGTARDCGTR-DRTCGRAGD
>>> for name, seq in file.items():
... print(name, seq)
seq1 ADTRCGTARDCGTR-DRTCGRAGD
seq2 ADTRCGT---CGTRADRTCGRAGD
"""

def __init__(self):
super().__init__()
self._entries = OrderedDict()

@classmethod
def read(cls, file):
"""
Read a ClustalW file.

Parameters
----------
file : file-like object or str
The file to be read.
Alternatively a file path can be supplied.

Returns
-------
file_object : ClustalFile
The parsed file.
"""
file = super().read(file)
if len(file.lines) == 0:
raise InvalidFileError("File is empty")
file._parse()
return file

def _parse(self):
"""
Parse the lines into sequence entries.
"""
# Validate header
if not self.lines[0].startswith("CLUSTAL"):
raise InvalidFileError(
f"File does not start with 'CLUSTAL' header, "
f"got '{self.lines[0][:20]}...'"
)

self._entries = OrderedDict()
for line in self.lines[1:]:
stripped = line.strip()
# Skip blank lines
if len(stripped) == 0:
continue
# Skip consensus lines (lines that start with whitespace
# and contain only consensus characters)
if line[0] == " " and all(c in _CONSENSUS_CHARS for c in line):
continue
# Sequence lines: name followed by whitespace and sequence
# The name must start at the beginning of the line
# (no leading whitespace)
if line[0] != " ":
parts = line.split()
if len(parts) >= 2:
name = parts[0]
seq_segment = parts[1]
if name in self._entries:
self._entries[name] += seq_segment
else:
self._entries[name] = seq_segment

def __setitem__(self, name, seq_str):
if not isinstance(name, str):
raise IndexError(
"'ClustalFile' only supports sequence name strings as keys"
)
if not isinstance(seq_str, str):
raise TypeError(
"'ClustalFile' only supports sequence strings as values"
)
self._entries[name] = seq_str
self._rebuild_lines()

def __getitem__(self, name):
if not isinstance(name, str):
raise IndexError(
"'ClustalFile' only supports sequence name strings as keys"
)
return self._entries[name]

def __delitem__(self, name):
del self._entries[name]
self._rebuild_lines()

def __len__(self):
return len(self._entries)

def __iter__(self):
return self._entries.__iter__()

def __contains__(self, name):
return name in self._entries

def _rebuild_lines(self, line_length=60):
"""
Rebuild the text lines from the stored entries.
"""
self.lines = ["CLUSTAL W multiple sequence alignment", ""]

if len(self._entries) == 0:
return

names = list(self._entries.keys())
sequences = list(self._entries.values())
total_len = max(len(s) for s in sequences)
# Pad name column to accommodate the longest name + spacing
name_width = max(len(n) for n in names)

for start in range(0, total_len, line_length):
for name, seq_str in zip(names, sequences):
segment = seq_str[start : start + line_length]
if len(segment) > 0:
padding = " " * (name_width - len(name) + 6)
self.lines.append(f"{name}{padding}{segment}")
self.lines.append("")
6 changes: 6 additions & 0 deletions tests/sequence/data/clustal.aln
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
CLUSTAL W (1.83) multiple sequence alignment

seq1 ADTRCGTARDCGTR-DRTCGRAGD
seq2 ADTRCGT---CGTRADRTCGRAGD
seq3 ADTRCGTARDCGTRADR--GRAGD
******* ***** ** *****
9 changes: 9 additions & 0 deletions tests/sequence/data/clustal_multi.aln
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
CLUSTAL W (1.83) multiple sequence alignment

seq1 ADTRCGTARDCGTR-DRTCGRAGD
seq2 ADTRCGT---CGTRADRTCGRAGD
*** ** *********

seq1 ADTRCGTARDCGTR-DRTCGRAGD
seq2 ADTRCGT---CGTRADRTCGRAGD
*** ** *********
Loading
Loading