CMAPLE 1.0.0
C++ MAximum Parsimonious Likelihood Estimation
Loading...
Searching...
No Matches
alignment.h
1#include "../utils/timeutil.h"
2#include "sequence.h"
3
4#ifndef CMAPLE_ALIGNMENT_H
5#define CMAPLE_ALIGNMENT_H
6
7namespace cmaple {
// Alignment: declares an alignment that can be read from, and written to, a
// stream or a file in FASTA, PHYLIP, or MAPLE format (see read()/write()).
// NOTE(review): every line of this listing carries its original line number as
// a prefix, and the numbers are not contiguous - some original lines (doc
// comments, constructor heads, trailing parameters) are absent from this view.
9class Alignment {
10 public:
// Alignment/input formats. Only IN_MAPLE is visible in this listing; the
// declarations below also use IN_AUTO as a default, and the generated summary
// for this header lists IN_FASTA, IN_PHYLIP, IN_AUTO and IN_UNKNOWN as well.
14 enum InputType {
17 IN_MAPLE,
21 };
22
23 // ----------------- BEGIN OF PUBLIC APIs ------------------------------------
24 // //
28
// Parameter list of a declaration whose opening and closing lines are missing
// from this listing. NOTE(review): presumably the constructor from a stream of
// an alignment in FASTA, PHYLIP, or MAPLE format - confirm against the header.
45 std::istream& aln_stream,
46 const std::string& ref_seq = "",
47 const InputType format = IN_AUTO,
49
// Parameter list of a declaration whose opening and closing lines are missing
// from this listing. NOTE(review): presumably the constructor from an
// alignment file in FASTA, PHYLIP, or MAPLE format - confirm against the header.
68 const std::string& aln_filename,
69 const std::string& ref_seq = "",
70 const InputType format = IN_AUTO,
72
76
// Read an alignment from a stream in FASTA, PHYLIP, or MAPLE format.
// ref_seq defaults to an empty string and format defaults to IN_AUTO.
// The remainder of the parameter list is not present in this listing.
91 void read(
92 std::istream& aln_stream,
93 const std::string& ref_seq = "",
94 const InputType format = IN_AUTO,
96
// Read an alignment from a file in FASTA, PHYLIP, or MAPLE format.
// Same defaults as the stream overload; the remainder of the parameter list
// is not present in this listing.
113 void read(
114 const std::string& aln_filename,
115 const std::string& ref_seq = "",
116 const InputType format = IN_AUTO,
118
// Write the alignment to a stream in FASTA, PHYLIP, or MAPLE format.
// format defaults to IN_MAPLE.
127 void write(std::ostream& aln_stream, const InputType& format = IN_MAPLE);
128
// Write the alignment to a file in FASTA, PHYLIP, or MAPLE format.
// format defaults to IN_MAPLE and overwrite defaults to false.
// NOTE(review): behaviour when the file exists and overwrite is false is not
// visible here - confirm in the definition.
143 void write(const std::string& aln_filename,
144 const InputType& format = IN_MAPLE,
145 const bool overwrite = false);
146
147 // ----------------- END OF PUBLIC APIs ------------------------------------
148 // //
149
// Returns the stored sequence type (seq_type_). The declaration of seq_type_
// is not present in this listing.
154 inline cmaple::SeqRegion::SeqType getSeqType() const {
155 return seq_type_;
156 }
157
// Stores the given sequence type, then calls updateNumStates() so the two
// stay in step.
161 inline void setSeqType(cmaple::SeqRegion::SeqType seq_type) {
162 seq_type_ = seq_type;
163 updateNumStates();
164 }
165
// Takes a reference file name and a reference name and returns a string.
// NOTE(review): presumably the reference sequence read from that file -
// confirm in the definition.
175 auto readRefSeq(const std::string& ref_filename,
176 const std::string& ref_name) -> std::string;
177
// Static helper mapping a state and a sequence type to a char.
184 static char convertState2Char(const cmaple::StateType& state,
185 const cmaple::SeqRegion::SeqType& seqtype);
186
// Static helper mapping a format name string to an InputType value.
192 static InputType parseAlnFormat(const std::string& n_format);
193
// Sequences held by this alignment.
197 std::vector<Sequence>
198 data; // note: this is inefficient, but only used briefly
199
// Reference sequence stored as a vector of states.
203 std::vector<cmaple::StateType> ref_seq;
204
// Number of states; no in-class initializer is given here.
// NOTE(review): presumably maintained by updateNumStates() - confirm.
208 cmaple::StateType num_states;
209
// Format of the alignment; starts as IN_AUTO.
213 InputType aln_format = IN_AUTO;
214
// Untyped pointers kept in a set.
// NOTE(review): presumably the trees currently attached to this alignment,
// stored as void* - confirm ownership and lifetime with the tree code.
218 std::unordered_set<void*> attached_trees;
219
220 private:
225
// Internal helpers follow. Only their signatures are visible in this listing;
// the notes on them are inferred from names and must be confirmed against the
// definitions.
229 void reset();
230
234 void updateNumStates();
235
// Takes the raw sequences and returns a sequence type.
241 cmaple::SeqRegion::SeqType detectSequenceType(cmaple::StrVector& sequences);
242
// NOTE(review): presumably a distance of one sequence used for ordering, with
// hamming_weight as a weighting factor - confirm.
250 cmaple::PositionType computeSeqDistance(Sequence& sequence,
251 cmaple::RealNumType hamming_weight);
252
259 void sortSeqsByDistances();
260
// Non-static counterpart direction of convertState2Char: char in, state out.
267 cmaple::StateType convertChar2State(char state);
268
// Takes the sequences, their names, and a reference sequence string.
// NOTE(review): presumably fills `data` with per-sequence differences from
// the reference - confirm.
278 void extractMutations(const cmaple::StrVector& sequences,
279 const cmaple::StrVector& seq_names,
280 const std::string& ref_sequence);
281
// Format-specific readers taking the input stream.
290 void readMaple(std::istream& aln_stream);
291
300 void readFastaOrPhylip(std::istream& aln_stream,
301 const std::string& ref_seq = "");
302
// ref_sequence is taken by non-const reference, so it may be modified.
309 void parseRefSeq(std::string& ref_sequence, bool throw_error);
310
// The readers below fill `sequences` and `seq_names` through output
// references; check_min_seqs defaults to true.
319 void readFasta(std::istream& aln_stream,
320 cmaple::StrVector& sequences,
321 cmaple::StrVector& seq_names,
322 bool check_min_seqs = true);
323
332 void readPhylip(std::istream& aln_stream,
333 cmaple::StrVector& sequences,
334 cmaple::StrVector& seq_names,
335 bool check_min_seqs = true);
336
// Note: the aln_format parameter shadows the member of the same name.
346 void readSequences(std::istream& aln_stream,
347 cmaple::StrVector& sequences,
348 cmaple::StrVector& seq_names,
349 InputType aln_format = IN_AUTO,
350 bool check_min_seqs = true);
351
// Takes the sequences and returns a string.
// NOTE(review): presumably a reference derived from the sequences - confirm.
360 std::string generateRef(cmaple::StrVector& sequences);
361
366 void processSeq(std::string& sequence,
367 std::string& line,
368 cmaple::PositionType line_num);
369
// length defaults to -1. NOTE(review): presumably -1 means "no explicit
// length" - confirm in the definition.
374 void addMutation(Sequence* sequence,
375 char state_char,
376 cmaple::PositionType pos,
377 cmaple::PositionType length = -1);
378
// Format-specific writers taking the output stream.
383 void writeMAPLE(std::ostream& aln_stream);
384
389 void writeFASTA(std::ostream& aln_stream);
390
395 void writePHYLIP(std::ostream& aln_stream);
396
// Returns a string. NOTE(review): presumably the member ref_seq rendered as
// characters - confirm.
400 auto getRefSeqStr() -> std::string;
401
// Takes the reference string and a sequence pointer and returns a string.
405 auto getSeqString(const std::string& ref_seq_str, Sequence* sequence) -> std::string;
406
// Format detection helpers; both take the input stream and return an
// InputType. NOTE(review): whether they consume or rewind the stream is not
// visible here - confirm.
414 InputType detectMAPLEorFASTA(std::istream& aln_stream);
415
425 InputType detectInputFile(std::istream& aln_stream);
426
428};
429
// Stream insertion operator for an Alignment: takes the output stream and the
// alignment (by non-const reference) and returns a std::ostream&.
// NOTE(review): presumably writes the alignment via Alignment::write - confirm
// the output format in the definition.
438auto operator<<(std::ostream& out_stream, cmaple::Alignment& aln) -> std::ostream&;
439
// Stream extraction operator for an Alignment: takes the input stream and the
// alignment to fill and returns a std::istream&.
// NOTE(review): presumably reads the alignment via Alignment::read with its
// default arguments - confirm in the definition.
442auto operator>>(std::istream& in_stream, cmaple::Alignment& aln) -> std::istream&;
443
// Character arrays declared here and defined in another translation unit.
// NOTE(review): the names suggest per-data-type symbol alphabets (protein,
// DNA, RNA, morphological) - confirm contents and ordering where they are
// defined.
448extern char symbols_protein[];
449extern char symbols_dna[];
450extern char symbols_rna[];
451extern char symbols_morph[];
452} // namespace cmaple
453#endif
Definition alignment.h:9
Alignment(const std::string &aln_filename, const std::string &ref_seq="", const InputType format=IN_AUTO, const cmaple::SeqRegion::SeqType seqtype=cmaple::SeqRegion::SEQ_AUTO)
Constructor from an alignment file in FASTA, PHYLIP, or MAPLE format.
void read(std::istream &aln_stream, const std::string &ref_seq="", const InputType format=IN_AUTO, const cmaple::SeqRegion::SeqType seqtype=cmaple::SeqRegion::SEQ_AUTO)
Read an alignment from a stream in FASTA, PHYLIP, or MAPLE format.
~Alignment()
Destructor.
Alignment()
Default constructor.
void read(const std::string &aln_filename, const std::string &ref_seq="", const InputType format=IN_AUTO, const cmaple::SeqRegion::SeqType seqtype=cmaple::SeqRegion::SEQ_AUTO)
Read an alignment from a file in FASTA, PHYLIP, or MAPLE format.
void write(const std::string &aln_filename, const InputType &format=IN_MAPLE, const bool overwrite=false)
Write the alignment to a file in FASTA, PHYLIP, or MAPLE format.
void write(std::ostream &aln_stream, const InputType &format=IN_MAPLE)
Write the alignment to a stream in FASTA, PHYLIP, or MAPLE format.
Alignment(std::istream &aln_stream, const std::string &ref_seq="", const InputType format=IN_AUTO, const cmaple::SeqRegion::SeqType seqtype=cmaple::SeqRegion::SEQ_AUTO)
Constructor from a stream of an alignment in FASTA, PHYLIP, or MAPLE format.
InputType
Definition alignment.h:14
@ IN_UNKNOWN
Definition alignment.h:20
@ IN_FASTA
Definition alignment.h:15
@ IN_AUTO
Definition alignment.h:19
@ IN_PHYLIP
Definition alignment.h:16
@ IN_MAPLE
Definition alignment.h:17
SeqType
Definition seqregion.h:25
@ SEQ_AUTO
Definition seqregion.h:28
std::istream & operator>>(std::istream &in_stream, cmaple::Tree &tree)
Customized >> operator to read a tree from a stream.
std::ostream & operator<<(std::ostream &out_stream, cmaple::Tree &tree)
Customized << operator to output the tree string in a (bifurcating) NEWICK format to a stream.