CMAPLE 1.0.0
C++ MAximum Parsimonious Likelihood Estimation
Loading...
Searching...
No Matches
tools.h
1/****************************************************************************
2 * Copyright (C) 2022 by
3 * Nhan Ly-Trong <trongnhan.uit@gmail.com>
4 * Chris Bielow <chris.bielow@fu-berlin.de>
5 * Nicola De Maio <demaio@ebi.ac.uk>
6 * BUI Quang Minh <m.bui@anu.edu.au>
7 *
8 *
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the
22 * Free Software Foundation, Inc.,
23 * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
24 ***************************************************************************/
25
26#pragma once
27
28#include <cmaple_config.h>
29#include <sys/stat.h>
30#include <algorithm>
31#include <cfloat>
32#include <cmath>
33#include <cstdint>
34#include <cstdio>
35#include <cstdlib>
36#include <cstring>
37#include <fstream>
38#include <iomanip>
39#include <iostream>
40#include <map>
41#include <random>
42#include <regex>
43#include <set>
44#include <sstream>
45#include <stack>
46#include <string>
47#include <thread>
48#include <vector>
49#include <cassert>
50#include "operatingsystem.h"
51#ifdef _OPENMP
52#include <omp.h> /* OpenMP */
53#endif
54
55#ifdef NDEBUG
56#define ASSERT(EXPRESSION) ((void)0)
57#else
58#if defined(__GNUC__) || defined(__clang__)
59#define ASSERT(EXPRESSION) \
60 ((EXPRESSION) ? (void)0 \
61 : cmaple::_my_assert(#EXPRESSION, __PRETTY_FUNCTION__, \
62 __FILE__, __LINE__))
63#else
64#define ASSERT(EXPRESSION) \
65 ((EXPRESSION) \
66 ? (void)0 \
67 : cmaple::_my_assert(#EXPRESSION, __func__, __FILE__, __LINE__))
68#endif
69#endif
70
71#define USE_HASH_MAP
72
73#if defined(__GNUC__) && !defined(GCC_VERSION)
74#define GCC_VERSION \
75 (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
76// #else
77// #define GCC_VERSION 0
78#endif
79
80// for MSVC
81#ifndef __func__
82#define __func__ __FUNCTION__
83#endif
84
85#if defined(USE_HASH_MAP)
86// #include <unordered_map>
87// #include <unordered_set>
88
89#if defined(_MSC_VER)
90#include <unordered_map>
91#include <unordered_set>
92#elif defined(__clang__)
93// libc++ detected: _LIBCPP_VERSION
94// libstdc++ detected: __GLIBCXX__
95#if __has_include(<unordered_map>) // defines _LIBCPP_VERSION
96#include <unordered_map>
97#include <unordered_set>
98#else
99#include <tr1/unordered_map>
100#include <tr1/unordered_set>
101using namespace std::tr1;
102#endif
103#elif !defined(__GNUC__)
104#include <hash_map>
105#include <hash_set>
106using namespace stdext;
107#elif GCC_VERSION < 40300
108#include <ext/hash_map>
109#include <ext/hash_set>
110using namespace __gnu_cxx;
111#define unordered_map hash_map
112#define unordered_set hash_set
113/*
114NHANLT: resolve ambiguous reference
115std::tr1 is Technical Report 1, a proposal of extensions for C++0X when waiting
116for C++11 to get approved. Most of features (including
117std::tr1::unordered_map/set) in Technical Report 1 had been merged into C++11.
118C++11 had been fully implemented since GCC 4.8.1 -> we only need to use std::tr1
119in GCC older than 4.8.1
120*/
121#elif GCC_VERSION < 40800
122#include <tr1/unordered_map>
123#include <tr1/unordered_set>
124using namespace std::tr1;
125#else
126#include <unordered_map>
127#include <unordered_set>
128#endif
129
130#else
131#include <map>
132#include <set>
133#endif
134
135#if defined(USE_HASH_MAP) && GCC_VERSION < 40300 && !defined(_MSC_VER) && \
136 !defined(__clang__)
137/*
138 Define the hash function of Split
139 */
140#if !defined(__GNUC__)
141namespace stdext
142#else
143namespace __gnu_cxx
144#endif
145{
146template <>
147struct hash<string> {
148 size_t operator()(string str) const {
149 hash<const char*> hash_str;
150 return hash_str(str.c_str());
151 }
152};
153} // namespace stdext
154#endif // USE_HASH_MAP
155
156namespace cmaple {
157// redefine assertion
158inline void _my_assert(const char* expression,
159 const char* func,
160 const char* file,
161 int line) {
162 char* sfile = const_cast<char*>(strrchr(file, '/'));
163 if (!sfile)
164 sfile = const_cast<char*>(file);
165 else
166 sfile++;
167 std::cerr << sfile << ":" << line << ": " << func << ": Assertion `"
168 << expression << "' failed." << std::endl;
169 abort();
170}
171
175typedef uint16_t StateType;
176
180typedef int32_t PositionType;
181
185typedef uint32_t NumSeqsType;
186
190typedef int16_t LengthType;
194typedef int32_t LengthTypeLarge;
195
199typedef double RealNumType;
200
204typedef std::vector<RealNumType> RealNumberVector;
205
209typedef std::vector<int> IntList;
210
214typedef std::vector<int> IntVector;
215
219typedef std::vector<bool> BoolVector;
220
224typedef std::vector<char> CharVector;
225
229typedef std::vector<std::string> StrVector;
230
234typedef unsigned int UINT;
235
239enum RegionType : StateType {
240 TYPE_R = 250,
241 TYPE_O,
242 TYPE_N,
243 TYPE_DEL,
244 TYPE_INVALID
245};
246
250enum VerboseMode { VB_QUIET, VB_MIN, VB_MED, VB_MAX, VB_DEBUG };
251
255extern VerboseMode verbose_mode;
256
260enum MiniIndex : NumSeqsType {
261 TOP,
262 LEFT,
263 RIGHT,
264 UNDEFINED // to handle case return node = null
265};
266
267/*--------------------------- NODE's INDEX -----------------------------------*/
273struct Index {
278 Index() : vector_index_(0), mini_index_(UNDEFINED){};
279
283 Index(NumSeqsType vec_index, MiniIndex mini_index)
284 : vector_index_(vec_index), mini_index_(mini_index){};
285
289 MiniIndex getMiniIndex() const { return mini_index_; }
290
295 MiniIndex getFlipMiniIndex() const { return MiniIndex(3 - mini_index_); }
296
300 void setMiniIndex(MiniIndex mini_index) { mini_index_ = mini_index; }
301
305 NumSeqsType getVectorIndex() const { return vector_index_; }
306
310 void setVectorIndex(NumSeqsType vec_index) { vector_index_ = vec_index; }
311
315 bool operator==(const Index& rhs) const {
316 return (vector_index_ == rhs.getVectorIndex() &&
317 mini_index_ == rhs.getMiniIndex());
318 }
319
320 private:
321 NumSeqsType vector_index_ : 30;
322 MiniIndex mini_index_ : 2;
323};
325/*--------------------------- NODE's INDEX -----------------------------------*/
326
327/*--------------------------------------------------------------*/
328/*--------------------------------------------------------------*/
329const char REF_NAME[] = "REF";
330const int MIN_NUM_TAXA = 3;
331const RealNumType MIN_NEGATIVE =
332 std::numeric_limits<RealNumType>::lowest(); // -FLT_MAX;
333const RealNumType MIN_POSITIVE =
334 (std::numeric_limits<RealNumType>::min)(); // FLT_MIN;
335// const RealNumType INVERSE_MIN_POSITIVE = 1.0 / MIN_POSITIVE;
336// const RealNumType LOG_MIN_POSITIVE = log(MIN_POSITIVE);
337const RealNumType MAX_POSITIVE = (std::numeric_limits<RealNumType>::max)();
338const RealNumType LOG_MAX_POSITIVE = log(MAX_POSITIVE);
339
340template <class T>
341constexpr T getMinCarryOver();
342template <>
343constexpr double getMinCarryOver<double>() {
344 return 1e-250;
345}; // of -308
346template <>
347constexpr float getMinCarryOver<float>() {
348 return 1e-36f;
349}; // of -38
350
351const RealNumType MIN_CARRY_OVER = getMinCarryOver<RealNumType>();
352const RealNumType MEAN_SUBS_PER_SITE = 0.02;
353const RealNumType MAX_SUBS_PER_SITE = 0.067;
354
355/*--------------------------------------------------------------*/
356/*--------------------------------------------------------------*/
360class Params {
361 private:
365 Params();
366
367 // declare ParamsBuilder as a friend class to allow it access to the default
368 // contructor Params();
369 friend class ParamsBuilder;
370
371 public:
375 std::string aln_path;
376
380 std::string ref_path;
381
385 std::string ref_seqname;
386
390 std::string aln_format_str;
391
395 std::string sub_model_str;
396
400 bool overwrite_output;
401
405 bool fixed_blengths;
406
411 RealNumType threshold_prob;
412
416 RealNumType threshold_prob2;
417
421 int failure_limit_sample;
422
426 int failure_limit_subtree;
427
432 int failure_limit_subtree_short_search;
433
438 bool strict_stop_seeking_placement_sample;
439
443 bool strict_stop_seeking_placement_subtree;
444
449 bool strict_stop_seeking_placement_subtree_short_search;
450
455 RealNumType thresh_log_lh_sample;
456
461 RealNumType thresh_log_lh_subtree;
462
467 RealNumType thresh_log_lh_subtree_short_search;
468
472 RealNumType thresh_log_lh_failure;
473
478 RealNumType fixed_min_blength;
479
486 RealNumType min_blength_factor;
487
493 RealNumType max_blength_factor;
494
498 RealNumType min_blength_mid_factor;
499
504 RealNumType thresh_diff_update;
505
510 RealNumType thresh_diff_fold_update;
511
516 PositionType mutation_update_period;
517
521 std::string output_aln;
522
526 std::string output_aln_format_str;
527
531 std::string input_treefile;
532
537 int32_t num_tree_improvement;
538
544 RealNumType thresh_placement_cost;
545
551 RealNumType thresh_entire_tree_improvement;
552
557 RealNumType thresh_placement_cost_short_search;
558
562 std::string tree_format_str;
563
567 bool shallow_tree_search;
568
572 char* output_testing;
573
577 bool compute_aLRT_SH;
578
582 PositionType aLRT_SH_replicates;
583
587 RealNumType aLRT_SH_half_epsilon;
588
592 uint32_t num_threads;
593
599 uint64_t ran_seed;
600
604 std::string output_prefix;
605
610 bool allow_replace_input_tree;
611
615 std::string seq_type_str;
616
620 std::string tree_search_type_str;
621
625 bool make_consistent;
626
630 RealNumType max_subs_per_site;
631
635 RealNumType mean_subs_per_site;
636
637 /*
638 TRUE to log debugging
639 */
640 // bool debug = false;
641};
651 public:
656
664 ParamsBuilder& withRandomSeed(const uint64_t& seed);
665
672 ParamsBuilder& withThreshProb(const double& thresh_prob);
673
684 ParamsBuilder& withMinBlengthFactor(const double& min_blength_factor);
685
695 ParamsBuilder& withMaxBlengthFactor(const double& max_blength_factor);
696
704 ParamsBuilder& withFixedMinBlength(const double& fixed_min_blength);
705
714 const int32_t& mutation_update_period);
715
722 ParamsBuilder& withNumTreeTraversal(const int32_t& num_tree_traversal);
723
732 ParamsBuilder& withSPRThresh(const double& SPR_thresh);
733
742 ParamsBuilder& withStopTreeSearchThresh(const double& stop_search_thresh);
743
747 std::unique_ptr<cmaple::Params> build();
748
749 private:
753 std::unique_ptr<Params> params_ptr{};
754};
755
756/*--------------------------------------------------------------*/
757/*--------------------------------------------------------------*/
758
762void outError(const char* error, bool quit = true);
763
767void outError(const std::string& error, bool quit = true);
768
769/*--------------------------------------------------------------*/
770/*--------------------------------------------------------------*/
771
775void outError(const char* error, const char* msg, bool quit = true);
776
780void outError(const char* error, const std::string& msg, bool quit = true);
781
786void outWarning(const char* warn);
787void outWarning(const std::string& warn);
788
790std::istream& safeGetline(std::istream& is, std::string& t);
791
795inline bool controlchar(char& ch) {
796 return ch <= 32;
797}
798
799inline bool is_newick_token(char& ch) {
800 return ch == ':' || ch == ';' || ch == ',' || ch == ')' || ch == '(' ||
801 ch == '[' || ch == ']';
802}
803
809bool renameString(std::string& name);
810
811/*--------------------------------------------------------------*/
812/*--------------------------------------------------------------*/
813/*
814 Error messages
815 */
816const char ERR_NO_TAXON[] = "Find no taxon with name ";
817const char ERR_NO_AREA[] = "Find no area with name ";
818const char ERR_NO_MEMORY[] = "Not enough memory!";
819const char ERR_NEG_BRANCH[] = "Negative branch length not allowed.";
820const char ERR_READ_INPUT[] =
821 "File not found or incorrect input, pls check it again.";
822const char ERR_READ_ANY[] =
823 "Unidentified error while reading file, pls check it carefully again.";
824
825/*--------------------------------------------------------------*/
826/*--------------------------------------------------------------*/
827
833std::string convertPosTypeToString(PositionType number);
834std::string convertIntToString(int number);
835std::string convertInt64ToString(int64_t number);
836
837std::string convertDoubleToString(RealNumType number);
838
839std::string convertDoubleToString(RealNumType number, uint8_t precision);
840
845bool iEquals(const std::string& a, const std::string& b);
846
853bool copyFile(const char SRC[], const char DEST[]);
854
860bool fileExists(const std::string& strFilename);
861
865int isDirectory(const char* path);
866
873int convert_int(const char* str);
874
881int64_t convert_int64(const char* str);
882
890int convert_int(const char* str, int& end_pos);
891
898void convert_int_vec(const char* str, IntVector& vec);
899
906int64_t convert_int64(const char* str);
907
915int64_t convert_int64(const char* str, int& end_pos);
916
923RealNumType convert_real_number(const char* str);
924
932RealNumType convert_real_number(const char* str, int& end_pos);
933
940void convert_real_numbers(RealNumType*& arr, std::string input_str);
941
949void convert_real_number_vec(const char* str,
950 RealNumberVector& vec,
951 char separator = ',');
952
960void normalize_frequencies_from_index(RealNumType* freqs,
961 int num_states,
962 int starting_index);
963
970inline void normalize_arr(RealNumType* const entries,
971 const int num_entries,
972 RealNumType sum_entries) {
973 assert(num_entries > 0);
974
975#ifndef NDEBUG
976 // if (fabs(sum_entries) < 1e-5)
977 // outError("Sum of entries must be greater than zero!");
978#endif
979
980 sum_entries = 1.0 / sum_entries;
981 for (int i = 0; i < num_entries; ++i)
982 entries[i] *= sum_entries;
983 // entries[i] /= sum_entries;
984}
985
991inline void normalize_arr(RealNumType* const entries, const int num_entries) {
992 RealNumType sum_entries = 0;
993 for (int i = 0; i < num_entries; ++i)
994 sum_entries += entries[i];
995 normalize_arr(entries, num_entries, sum_entries);
996}
1002std::string convert_time(const RealNumType sec);
1003
1012void convert_range(const char* str, int& lower, int& upper, int& step_size);
1013
1022void convert_range(const char* str,
1023 RealNumType& lower,
1024 RealNumType& upper,
1025 RealNumType& step_size);
1026
1034void reinitDoubleArr(RealNumType*& arr,
1035 StateType size,
1036 bool delete_first = true,
1037 bool set_zero = true);
1038
1042void convert_string_vec(const char* str,
1043 StrVector& str_vec,
1044 char separator = ',');
1045
1052PositionType convert_positiontype(const char* str);
1053
1058bool is_number(const std::string& s);
1059
1066void parseArg(int argc, char* argv[], Params& params);
1067
1071void quickStartGuide();
1072
1076void usage_cmaple();
1077
1082void trimString(std::string& str);
1083
1087int countPhysicalCPUCores();
1088
1096template <class T1, class T2>
1097void quicksort(T1* arr, int left, int right, T2* arr2 = NULL) {
1098 if (left > right)
1099 return;
1100 assert(left <= right);
1101 int i = left, j = right;
1102 T1 pivot = arr[(left + right) / 2];
1103
1104 /* partition */
1105 while (i <= j) {
1106 while (arr[i] < pivot)
1107 ++i;
1108 while (arr[j] > pivot)
1109 --j;
1110 if (i <= j) {
1111 T1 tmp = arr[i];
1112 arr[i] = arr[j];
1113 arr[j] = tmp;
1114 if (arr2) {
1115 T2 tmp2 = arr2[i];
1116 arr2[i] = arr2[j];
1117 arr2[j] = tmp2;
1118 }
1119 ++i;
1120 --j;
1121 }
1122 };
1123
1124 /* recursion */
1125 if (left < j)
1126 quicksort(arr, left, j, arr2);
1127 if (i < right)
1128 quicksort(arr, i, right, arr2);
1129}
1130
1137void usage_cmaple(char* argv[], bool full_command);
1138
1142void printCopyright(std::ostream& out);
1143
1149void setNumThreads(const int num_threads);
1150
1154template <typename T, typename... Args>
1155std::unique_ptr<T> make_unique(Args&&... args) {
1156 return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
1157}
1158
1163void resetStream(std::istream& instream);
1164} // namespace cmaple
Definition tools.h:650
ParamsBuilder & withFixedMinBlength(const double &fixed_min_blength)
Specify a minimum value of the branch lengths. Default: the minimum branch length is computed from 'm...
ParamsBuilder & withRandomSeed(const uint64_t &seed)
Specify a seed number for random generators. Default: the clock of the PC. Be careful!...
std::unique_ptr< cmaple::Params > build()
Build the Params object after initializing parameters.
ParamsBuilder & withMinBlengthFactor(const double &min_blength_factor)
Specify a factor to compute the minimum branch length (if fixed_min_blength is specified,...
ParamsBuilder & withNumTreeTraversal(const int32_t &num_tree_traversal)
Specify the number of times we traverse the tree looking for topological improvements (applying SPR m...
ParamsBuilder & withStopTreeSearchThresh(const double &stop_search_thresh)
Specify a threshold to stop the tree search. If the total log likelihood improvement obtained by an i...
ParamsBuilder & withMaxBlengthFactor(const double &max_blength_factor)
Specify a factor to compute the maximum branch length. Default: 40 <Maximum branch length> = <max_b...
ParamsBuilder()
Default constructor - Initializing a builder using the default values.
ParamsBuilder & withMutationUpdatePeriod(const int32_t &mutation_update_period)
Specify the period (in term of the number of sample placements) to update the substitution rate matri...
ParamsBuilder & withThreshProb(const double &thresh_prob)
Specify a relative probability threshold, which is used to ignore possible states with very low proba...
ParamsBuilder & withSPRThresh(const double &SPR_thresh)
Specify a threshold that avoids CMAPLE trying to apply SPR moves on nodes that have the placement cos...