Skip to content

Alignment Command-Line Bridge

This extends Bio.Align

BowtieBuildCommandline (AbstractCommandline)

Interface to Bowtie2-build indexing software

Source code in seqlike/AlignCommandline.py
class BowtieBuildCommandline(AbstractCommandline):
    """
    Interface to Bowtie2-build indexing software
    """

    def __init__(self, cmd="bowtie2-build", **kwargs):
        self.program_name = cmd
        self.parameters = [
            _Argument(["infile"], "input filename", filename=True),
            _Argument(["bt2_base"], "bt2 output basename"),
            _Switch(["-q", "--quiet", "quiet"], "print only error messages"),
        ]
        super(BowtieBuildCommandline, self).__init__(cmd, **kwargs)

BowtieCommandline (AbstractCommandline)

Interface to Bowtie2 alignment.

The following example:

cline = BowtieCommandline(
    local=True,
    threads=4,
    extends=20,
    reseeds=3,
    mismatches=1,
    length=15,
    interval='S,1,0.5',
    index='ref',
    read1='read1.fastq',
    read2='read2.fastq',
    sam='test.sam'
)
stdout,stderr = cline()

is equivalent to the command line:

bowtie2 --local -p 4 -D 20 -R 3 -N 1 -L 15 -i S,1,0.5 -x ref -1 read1.fastq -2 read2.fastq -S test.sam
Source code in seqlike/AlignCommandline.py
class BowtieCommandline(AbstractCommandline):
    """
    Interface to Bowtie2 alignment.

    The following example:

    ```python
    cline = BowtieCommandline(
        local=True,
        threads=4,
        extends=20,
        reseeds=3,
        mismatches=1,
        length=15,
        interval='S,1,0.5',
        index='ref',
        read1='read1.fastq',
        read2='read2.fastq',
        sam='test.sam'
    )
    stdout,stderr = cline()
    ```

    is equivalent to the command line:

    ```bash
    bowtie2 --local -p 4 -D 20 -R 3 -N 1 -L 15 -i S,1,0.5 -x ref -1 read1.fastq -2 read2.fastq -S test.sam
    ```
    """

    def __init__(self, cmd="bowtie2", **kwargs):
        self.program_name = cmd
        self.parameters = [
            _Switch(["--local", "local"], "perform local read alignment"),
            _Switch(["-r", "raw"], "query input files are raw one-sequence-per-line"),
            ## performance
            _Option(["-p", "threads"], "number of alignment threads to launch", equate=False),
            ## effort
            _Option(
                ["-D", "extends"],
                "give up after this many failed extends in a row",
                equate=False,
            ),
            _Option(
                ["-R", "reseeds"],
                'max # "re-seed" reads with repetitive seeds',
                equate=False,
            ),
            ## alignment
            _Option(
                ["-N", "mismatches"],
                "max # mismatches in seed alignment",
                checker_function=lambda x: x in [0, 1],
                equate=False,
            ),
            _Option(
                ["-L", "length"],
                "length of seed substring",
                checker_function=lambda x: x > 3,
                equate=False,
            ),
            _Option(
                ["-i", "interval"],
                "function for interval between seed substrings w/r/t read len",
                equate=False,
            ),
            ## main arguments
            _Option(["-x", "index"], "index filename prefix", equate=False),
            _Option(["-1", "read1"], "files with #1 mates", filename=True, equate=False),
            _Option(["-2", "read2"], "files with #2 mates", filename=True, equate=False),
            _Option(["-S", "sam"], "file for SAM output", filename=True, equate=False),
        ]
        super(BowtieCommandline, self).__init__(cmd, **kwargs)

FlashCommandline (AbstractCommandline)

Interface to FLASH alignment.

The following example:

cline = FlashCommandline(max_overlap=200, read1='read1.fastq',read2='read2.fastq') stdout,stderr = cline()

is equivalent to the command line:

$ flash read1.fastq read2.fastq -M 200

Source code in seqlike/AlignCommandline.py
class FlashCommandline(AbstractCommandline):
    """
    Interface to FLASH alignment.

    The following example:

    cline = FlashCommandline(max_overlap=200,
                read1='read1.fastq',read2='read2.fastq')
    stdout,stderr = cline()

    is equivalent to the command line:

    $ flash read1.fastq read2.fastq -M 200
    """

    def __init__(self, cmd="flash", **kwargs):
        self.program_name = cmd
        self.parameters = [
            _Argument(["read1"], "read1 input filename", filename=True),
            _Argument(["read2"], "read2 input filename", filename=True),
            _Switch(
                ["-O", "allow_outies"],
                'also try combining read pairs in the "outie" orientation',
            ),
            ## alignment
            _Option(
                ["-M", "max_overlap"],
                "max overlap length expected in 90% of read pairs",
                checker_function=lambda x: x > 0,
                equate=False,
            ),
            ## I/O, processing
            _Option(
                ["-o", "output_prefix"],
                'Prefix of output files. Default: "out".',
                equate=False,
            ),
            _Option(
                ["-d", "output_directory"],
                "Path to directory for output files. Default: current working directory.",
                equate=False,
            ),
            _Option(
                ["-t", "threads"],
                "Set the number of worker threads. This is in addition to the I/O threads.",
                equate=False,
            ),
        ]
        super(FlashCommandline, self).__init__(cmd, **kwargs)

MuscleCommandline (AbstractCommandline)

Extend Biopython's MuscleCommandline to include two additional options

\sa Bio.Align.Applications.MuscleCommandline

Source code in seqlike/AlignCommandline.py
class MuscleCommandline(AbstractCommandline):
    """
    Extend Biopython's MuscleCommandline to include two additional options

    \sa Bio.Align.Applications.MuscleCommandline
    """

    def __init__(self, cmd="muscle", **kwargs):
        ## This is taken from Applications.MuscleCommandline with some amendments
        CLUSTERING_ALGORITHMS = ["upgma", "upgmb", "neighborjoining"]
        DISTANCE_MEASURES_ITER1 = [
            "kmer6_6",
            "kmer20_3",
            "kmer20_4",
            "kbit20_3",
            "kmer4_6",
        ]
        DISTANCE_MEASURES_ITER2 = DISTANCE_MEASURES_ITER1 + [
            "pctid_kimura",
            "pctid_log",
        ]
        OBJECTIVE_SCORES = ["sp", "ps", "dp", "xp", "spf", "spm"]
        TREE_ROOT_METHODS = ["pseudo", "midlongestspan", "minavgleafdist"]
        SEQUENCE_TYPES = ["protein", "nucleo", "auto"]
        WEIGHTING_SCHEMES = [
            "none",
            "clustalw",
            "henikoff",
            "henikoffpb",
            "gsc",
            "threeway",
        ]
        self.parameters = [
            # Can't use "in" as the final alias as this is a reserved word in python:
            _Option(["-in", "in", "input"], "Input filename", filename=True, equate=False),
            _Option(["-out", "out"], "Output filename", filename=True, equate=False),
            _Switch(["-diags", "diags"], "Find diagonals (faster for similar sequences)"),
            _Switch(["-profile", "profile"], "Perform a profile alignment"),
            _Option(
                ["-in1", "in1"],
                "First input filename for profile alignment",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-in2", "in2"],
                "Second input filename for a profile alignment",
                filename=True,
                equate=False,
            ),
            # anchorspacing   Integer              32                 Minimum spacing between
            _Option(
                ["-anchorspacing", "anchorspacing"],
                "Minimum spacing between anchor columns",
                checker_function=lambda x: isinstance(x, int),
                equate=False,
            ),
            # center          Floating point       [1]                Center parameter.
            #                                                        Should be negative.
            _Option(
                ["-center", "center"],
                "Center parameter - should be negative",
                checker_function=lambda x: isinstance(x, float),
                equate=False,
            ),
            # cluster1        upgma                upgmb              Clustering method.
            _Option(
                ["-cluster1", "cluster1"],
                "Clustering method used in iteration 1",
                checker_function=lambda x: x in CLUSTERING_ALGORITHMS,
                equate=False,
            ),
            # cluster2        upgmb                                   cluster1 is used in
            #                neighborjoining                         iteration 1 and 2,
            #                                                        cluster2 in later
            #                                                        iterations.
            _Option(
                ["-cluster2", "cluster2"],
                "Clustering method used in iteration 2",
                checker_function=lambda x: x in CLUSTERING_ALGORITHMS,
                equate=False,
            ),
            # diaglength      Integer              24                 Minimum length of
            #                                                        diagonal.
            _Option(
                ["-diaglength", "diaglength"],
                "Minimum length of diagonal",
                checker_function=lambda x: isinstance(x, int),
                equate=True,
            ),
            # diagmargin      Integer              5                  Discard this many
            #                                                        positions at ends of
            #                                                        diagonal.
            _Option(
                ["-diagmargin", "diagmargin"],
                "Discard this many positions at ends of diagonal",
                checker_function=lambda x: isinstance(x, int),
                equate=False,
            ),
            # distance1       kmer6_6              Kmer6_6 (amino) or Distance measure for
            #                kmer20_3             Kmer4_6 (nucleo)   iteration 1.
            #                kmer20_4
            #                kbit20_3
            #                kmer4_6
            _Option(
                ["-distance1", "distance1"],
                "Distance measure for iteration 1",
                checker_function=lambda x: x in DISTANCE_MEASURES_ITER1,
                equate=False,
            ),
            # distance2       kmer6_6              pctid_kimura       Distance measure for
            #                kmer20_3                                iterations 2, 3 ...
            #                kmer20_4
            #                kbit20_3
            #                pctid_kimura
            #                pctid_log
            _Option(
                ["-distance2", "distance2"],
                "Distance measure for iteration 2",
                checker_function=lambda x: x in DISTANCE_MEASURES_ITER2,
                equate=False,
            ),
            # gapopen         Floating point       [1]                The gap open score.
            #                                                        Must be negative.
            _Option(
                ["-gapopen", "gapopen"],
                "Gap open score - negative number",
                checker_function=lambda x: isinstance(x, float),
                equate=False,
            ),
            # hydro           Integer              5                  Window size for
            #                                                        determining whether a
            #                                                        region is hydrophobic.
            _Option(
                ["-hydro", "hydro"],
                "Window size for hydrophobic region",
                checker_function=lambda x: isinstance(x, int),
                equate=False,
            ),
            # hydrofactor     Floating point       1.2                Multiplier for gap
            #                                                        open/close penalties in
            #                                                        hydrophobic regions.
            _Option(
                ["-hydrofactor", "hydrofactor"],
                "Multiplier for gap penalties in hydrophobic regions",
                checker_function=lambda x: isinstance(x, float),
                equate=False,
            ),
            # log             File name            None.              Log file name (delete
            #                                                        existing file).
            _Option(["-log", "log"], "Log file name", filename=True, equate=False),
            # loga            File name            None.              Log file name (append
            #                                                        to existing file).
            _Option(
                ["-loga", "loga"],
                "Log file name (append to existing file)",
                filename=True,
                equate=False,
            ),
            # maxdiagbreak    Integer              1                  Maximum distance
            #                                                        between two diagonals
            #                                                        that allows them to
            #                                                        merge into one
            #                                                        diagonal.
            _Option(
                ["-maxdiagbreak", "maxdiagbreak"],
                "Maximum distance between two diagonals that allows " "them to merge into one diagonal",
                checker_function=lambda x: isinstance(x, int),
                equate=False,
            ),
            # maxhours        Floating point       None.              Maximum time to run in
            #                                                        hours. The actual time
            #                                                        may exceed the
            #                                                        requested limit by a
            #                                                        few minutes. Decimals
            #                                                        are allowed, so 1.5
            #                                                        means one hour and 30
            #                                                        minutes.
            _Option(
                ["-maxhours", "maxhours"],
                "Maximum time to run in hours",
                checker_function=lambda x: isinstance(x, float),
                equate=False,
            ),
            # maxiters        Integer 1, 2 ...     16                 Maximum number of
            #                                                        iterations.
            _Option(
                ["-maxiters", "maxiters"],
                "Maximum number of iterations",
                checker_function=lambda x: isinstance(x, int),
                equate=False,
            ),
            # maxtrees        Integer              1                  Maximum number of new
            #                                                        trees to build in
            #                                                        iteration 2.
            _Option(
                ["-maxtrees", "maxtrees"],
                "Maximum number of trees to build in iteration 2",
                checker_function=lambda x: isinstance(x, int),
                equate=False,
            ),
            # minbestcolscore Floating point       [1]                Minimum score a column
            #                                                        must have to be an
            #                                                        anchor.
            _Option(
                ["-minbestcolscore", "minbestcolscore"],
                "Minimum score a column must have to be an anchor",
                checker_function=lambda x: isinstance(x, float),
                equate=False,
            ),
            # minsmoothscore  Floating point       [1]                Minimum smoothed score
            #                                                        a column must have to
            #                                                        be an anchor.
            _Option(
                ["-minsmoothscore", "minsmoothscore"],
                "Minimum smoothed score a column must have to " "be an anchor",
                checker_function=lambda x: isinstance(x, float),
                equate=False,
            ),
            # objscore        sp                   spm                Objective score used by
            #                ps                                      tree dependent
            #                dp                                      refinement.
            #                xp                                      sp=sum-of-pairs score.
            #                spf                                     spf=sum-of-pairs score
            #                spm                                     (dimer approximation)
            #                                                        spm=sp for < 100 seqs,
            #                                                        otherwise spf
            #                                                        dp=dynamic programming
            #                                                        score.
            #                                                        ps=average profile-
            #                                                        sequence score.
            #                                                        xp=cross profile score.
            _Option(
                ["-objscore", "objscore"],
                "Objective score used by tree dependent refinement",
                checker_function=lambda x: x in OBJECTIVE_SCORES,
                equate=False,
            ),
            # root1           pseudo               pseudo             Method used to root
            _Option(
                ["-root1", "root1"],
                "Method used to root tree in iteration 1",
                checker_function=lambda x: x in TREE_ROOT_METHODS,
                equate=False,
            ),
            # root2           midlongestspan                          tree; root1 is used in
            #                minavgleafdist                          iteration 1 and 2,
            #                                                        root2 in later
            #                                                        iterations.
            _Option(
                ["-root2", "root2"],
                "Method used to root tree in iteration 2",
                checker_function=lambda x: x in TREE_ROOT_METHODS,
                equate=False,
            ),
            # seqtype         protein              auto               Sequence type.
            #                nucleo
            #                auto
            _Option(
                ["-seqtype", "seqtype"],
                "Sequence type",
                checker_function=lambda x: x in SEQUENCE_TYPES,
                equate=False,
            ),
            # smoothscoreceil Floating point       [1]                Maximum value of column
            #                                                        score for smoothing
            #                                                        purposes.
            _Option(
                ["-smoothscoreceil", "smoothscoreceil"],
                "Maximum value of column score for smoothing",
                checker_function=lambda x: isinstance(x, float),
                equate=False,
            ),
            # smoothwindow    Integer              7                  Window used for anchor
            #                                                        column smoothing.
            _Option(
                ["-smoothwindow", "smoothwindow"],
                "Window used for anchor column smoothing",
                checker_function=lambda x: isinstance(x, int),
                equate=False,
            ),
            # SUEFF           Floating point value 0.1                Constant used in UPGMB
            #                between 0 and 1.                        clustering. Determines
            #                                                        the relative fraction
            #                                                        of average linkage
            #                                                        (SUEFF) vs. nearest-
            #                                                        neighbor linkage (1
            #                                                        SUEFF).
            _Option(
                ["-sueff", "sueff"],
                "Constant used in UPGMB clustering",
                checker_function=lambda x: isinstance(x, float),
                equate=False,
            ),
            # tree1           File name            None               Save tree produced in
            _Option(["-tree1", "tree1"], "Save Newick tree from iteration 1", equate=False),
            # tree2                                                   first or second
            #                                                        iteration to given file
            #                                                        in Newick (Phylip-
            #                                                        compatible) format.
            _Option(["-tree2", "tree2"], "Save Newick tree from iteration 2", equate=False),
            # weight1         none                 clustalw           Sequence weighting
            _Option(
                ["-weight1", "weight1"],
                "Weighting scheme used in iteration 1",
                checker_function=lambda x: x in WEIGHTING_SCHEMES,
                equate=False,
            ),
            # weight2         henikoff                                scheme.
            #                henikoffpb                              weight1 is used in
            #                gsc                                     iterations 1 and 2.
            #                clustalw                                weight2 is used for
            #                threeway                                tree-dependent
            #                                                        refinement.
            #                                                        none=all sequences have
            #                                                        equal weight.
            #                                                        henikoff=Henikoff &
            #                                                        Henikoff weighting
            #                                                        scheme.
            #                                                        henikoffpb=Modified
            #                                                        Henikoff scheme as used
            #                                                        in PSI-BLAST.
            #                                                        clustalw=CLUSTALW
            #                                                        method.
            #                                                        threeway=Gotoh three-
            #                                                        way method.
            _Option(
                ["-weight2", "weight2"],
                "Weighting scheme used in iteration 2",
                checker_function=lambda x: x in WEIGHTING_SCHEMES,
                equate=False,
            ),
            # ################### FORMATS #######################################
            # Multiple formats can be specified on the command line
            # If -msf appears it will be used regardless of other formats
            # specified. If -clw appears (and not -msf), clustalw format will be
            # used regardless of other formats specified. If both -clw and
            # -clwstrict are specified -clwstrict will be used regardless of
            # other formats specified. If -fasta is specified and not -msf,
            # -clw, or clwstrict, fasta will be used. If -fasta and -html are
            # specified -fasta will be used. Only if -html is specified alone
            # will html be used. I kid ye not.
            # clw                no              Write output in CLUSTALW format (default is
            #                                   FASTA).
            _Switch(
                ["-clw", "clw"],
                "Write output in CLUSTALW format (with a MUSCLE header)",
            ),
            # clwstrict          no              Write output in CLUSTALW format with the
            #                                   "CLUSTAL W (1.81)" header rather than the
            #                                   MUSCLE version. This is useful when a post-
            #                                   processing step is picky about the file
            #                                   header.
            _Switch(
                ["-clwstrict", "clwstrict"],
                "Write output in CLUSTALW format with version 1.81 header",
            ),
            # fasta              yes             Write output in FASTA format. Alternatives
            #                                   include clw,
            #                                   clwstrict, msf and html.
            _Switch(["-fasta", "fasta"], "Write output in FASTA format"),
            # html               no              Write output in HTML format (default is
            #                                   FASTA).
            _Switch(["-html", "html"], "Write output in HTML format"),
            # msf                no              Write output in MSF format (default is
            #                                   FASTA).
            _Switch(["-msf", "msf"], "Write output in MSF format"),
            # Phylip interleaved - undocumented as of 3.7
            _Switch(["-phyi", "phyi"], "Write output in PHYLIP interleaved format"),
            # Phylip sequential - undocumented as of 3.7
            _Switch(["-phys", "phys"], "Write output in PHYLIP sequential format"),
            # ################# Additional specified output files #########
            _Option(
                ["-phyiout", "phyiout"],
                "Write PHYLIP interleaved output to specified filename",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-physout", "physout"],
                "Write PHYLIP sequential format to specified filename",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-htmlout", "htmlout"],
                "Write HTML output to specified filename",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-clwout", "clwout"],
                "Write CLUSTALW output (with MUSCLE header) to specified " "filename",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-clwstrictout", "clwstrictout"],
                "Write CLUSTALW output (with version 1.81 header) to " "specified filename",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-msfout", "msfout"],
                "Write MSF format output to specified filename",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-fastaout", "fastaout"],
                "Write FASTA format output to specified filename",
                filename=True,
                equate=False,
            ),
            # ############# END FORMATS ###################################
            # anchors            yes             Use anchor optimization in tree dependent
            #                                   refinement iterations.
            _Switch(
                ["-anchors", "anchors"],
                "Use anchor optimisation in tree dependent " "refinement iterations",
            ),
            # noanchors          no              Disable anchor optimization. Default is
            #                                   anchors.
            _Switch(
                ["-noanchors", "noanchors"],
                "Do not use anchor optimisation in tree dependent " "refinement iterations",
            ),
            # group              yes             Group similar sequences together in the
            #                                   output. This is the default. See also
            #                                   stable.
            _Switch(["-group", "group"], "Group similar sequences in output"),
            # stable             no              Preserve input order of sequences in output
            #                                   file. Default is to group sequences by
            #                                   similarity (group).
            _Switch(
                ["-stable", "stable"],
                "Do not group similar sequences in output (not supported in v3.8)",
            ),
            # ############# log-expectation profile score ######################
            # One of either -le, -sp, or -sv
            #
            # According to the doc, spn is default and the only option for
            # nucleotides: this doesnt appear to be true. -le, -sp, and -sv can
            # be used and produce numerically different logs (what is going on?)
            #
            # spn fails on proteins
            # le                 maybe           Use log-expectation profile score (VTML240).
            #                                    Alternatives are to use sp or sv. This is
            #                                    the default for amino acid sequences.
            _Switch(["-le", "le"], "Use log-expectation profile score (VTML240)"),
            # sv                 no              Use sum-of-pairs profile score (VTML240).
            #                                   Default is le.
            _Switch(["-sv", "sv"], "Use sum-of-pairs profile score (VTML240)"),
            # sp                 no              Use sum-of-pairs protein profile score
            #                                   (PAM200). Default is le.
            _Switch(["-sp", "sp"], "Use sum-of-pairs protein profile score (PAM200)"),
            # spn                maybe           Use sum-of-pairs nucleotide profile score
            #                                   (BLASTZ parameters). This is the only option
            #                                   for nucleotides, and is therefore the
            #                                   default.
            _Switch(["-spn", "spn"], "Use sum-of-pairs protein nucleotide profile score"),
            # ############# END log-expectation profile score ######################
            # quiet              no              Do not display progress messages.
            _Switch(["-quiet", "quiet"], "Use sum-of-pairs protein nucleotide profile score"),
            # refine             no              Input file is already aligned, skip first
            #                                   two iterations and begin tree dependent
            #                                   refinement.
            _Switch(["-refine", "refine"], "Only do tree dependent refinement"),
            # core               yes in muscle,  Do not catch exceptions.
            #                   no in muscled.
            _Switch(["-core", "core"], "Catch exceptions"),
            # nocore             no in muscle,   Catch exceptions and give an error message
            #                   yes in muscled. if possible.
            _Switch(["-nocore", "nocore"], "Do not catch exceptions"),
            # termgapsfull       no              Terminal gaps penalized with full penalty.
            #                                   [1] Not fully supported in this version.
            #
            # termgapshalf       yes             Terminal gaps penalized with half penalty.
            #                                   [1] Not fully supported in this version.
            #
            # termgapshalflonger no              Terminal gaps penalized with half penalty if
            #                                   gap relative to
            #                                   longer sequence, otherwise with full
            #                                   penalty.
            #                                   [1] Not fully supported in this version.
            # verbose            no              Write parameter settings and progress
            #                                   messages to log file.
            _Switch(["-verbose", "verbose"], "Write parameter settings and progress"),
            # version            no              Write version string to stdout and exit.
            _Switch(["-version", "version"], "Write version string to stdout and exit"),
        ]
        self.parameters += [
            _Option(
                ["-spscore", "spscore"],
                "Filename of multiple alignment for which to compute SP score",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-scorefile", "scorefile"],
                "Write alignment scores to specified filename",
                filename=True,
                equate=False,
            ),
        ]
        super(MuscleCommandline, self).__init__(cmd, **kwargs)