diff --git a/data/md5sum.txt b/data/md5sum.txt index e7f6123..d4ca674 100644 --- a/data/md5sum.txt +++ b/data/md5sum.txt @@ -6,11 +6,11 @@ aff19aeae4184c38cf364b3a94527098 tests/reads_pbat_pe_2.fq e94e71292fccd255bad3f3694efe7a4b tests/reads_rpbat_pe_1.fq 8bfa00acd0639dc66acd5aa8ac0369d5 tests/reads_rpbat_pe_2.fq bcbf01be810cbf4051292813eb6b9225 tests/tRex1.idx -ffff35a5110b3312555fc3ccaa78c99a tests/reads_pbat_pe.sam -af094fdc32b41a1605bb6d5b71e1b767 tests/reads_pe.sam -8370999d6ad6d5afc99ec483f16cc214 tests/reads_rpbat_pe.sam -a6d0c51aa69795b9bf3f9e424af35978 tests/reads.sam -6793c72a4957578afd2092f45f6dab5f tests/reads.mstats -8361153e3610769ed1697fd0551c4adf tests/reads_pbat_pe.mstats -00d3c8c0de72cca992b3561c7734b381 tests/reads_pe.mstats -dfe8102b04166f5127bddfc9c09893f0 tests/reads_rpbat_pe.mstats +e8a5fe0aec564a972ca7a3a36c0022f7 tests/reads_pbat_pe.sam +14e935a5b8a20c8a7422834c3327d1c7 tests/reads_pe.sam +98d6ad9d563e3318756bd5fc3fcbc263 tests/reads_rpbat_pe.sam +8126d46074213ad3674181f4ea4f8bd1 tests/reads.sam +981dd110d1675c77533a485204fb13cc tests/reads.mstats +ba2a0cdbc3e431adf9f85e36d00d783a tests/reads_pbat_pe.mstats +c8211fb637c03050b348e51f2ff91f3a tests/reads_pe.mstats +bd54ba1e720039cb4b662faa4f0e04b3 tests/reads_rpbat_pe.mstats diff --git a/src/abismal.cpp b/src/abismal.cpp index 639109b..51275b7 100644 --- a/src/abismal.cpp +++ b/src/abismal.cpp @@ -17,8 +17,18 @@ #include "abismal.hpp" +#include "AbismalAlign.hpp" +#include "AbismalIndex.hpp" +#include "OptionParser.hpp" +#include "bamxx.hpp" +#include "bisulfite_utils.hpp" +#include "dna_four_bit_bisulfite.hpp" +#include "popcnt.hpp" +#include "sam_record.hpp" +#include "smithlab_os.hpp" +#include "smithlab_utils.hpp" + #include -#include #include #include @@ -32,16 +42,7 @@ #include #include -#include "AbismalAlign.hpp" -#include "AbismalIndex.hpp" -#include "OptionParser.hpp" -#include "bamxx.hpp" -#include "bisulfite_utils.hpp" -#include "dna_four_bit_bisulfite.hpp" -#include "popcnt.hpp" -#include "sam_record.hpp" -#include "smithlab_os.hpp" -#include "smithlab_utils.hpp" +#include using abismal_clock = std::chrono::steady_clock; using abismal_timepoint = std::chrono::time_point; @@ -431,8 +432,7 @@ chrom_and_posn(const ChromLookup &cl, const bam_cigar_t &cig, enum map_type { map_unmapped, map_unique, map_ambig }; static map_type -format_se(const bool allow_ambig, const bool strict_sam, const se_element &res, - const ChromLookup &cl, +format_se(const bool allow_ambig, const se_element &res, const ChromLookup &cl, // ADS: 'read' should not be used after a call to 'format_se' std::string &read, const std::string &read_name, const bam_cigar_t &cigar, bam_rec &sr) { @@ -445,12 +445,11 @@ format_se(const bool allow_ambig, const bool strict_sam, const se_element &res, if (!valid || !chrom_and_posn(cl, cigar, res.pos, ref_s, ref_e, chrom_idx)) return map_unmapped; - // ADS: we might be doing format_se for a mate in paried reads + // ADS: we might be doing format_se for an end in paried reads std::uint16_t flag = 0; if (res.rc()) { flag |= BAM_FREVERSE; - if (strict_sam) - revcomp_inplace(read); + revcomp_inplace(read); } if (allow_ambig && ambig) @@ -591,8 +590,7 @@ valid_pair(const pe_element &best, const std::uint32_t readlen1, */ static map_type format_pe( - const bool allow_ambig, const bool strict_sam, const pe_element &p, - const ChromLookup &cl, + const bool allow_ambig, const pe_element &p, const ChromLookup &cl, // ADS: 'read1' and 'read2' should not be used after a call to format_pe std::string &read1, std::string &read2, const std::string &name1, const std::string &name2, const bam_cigar_t &cig1, const bam_cigar_t &cig2, @@ -627,14 +625,12 @@ format_pe( if (p.r1.rc()) { // ADS: is p.r1.rc() always !p.r2.rc()? flag1 |= BAM_FREVERSE; flag2 |= BAM_FMREVERSE; - if (strict_sam) - revcomp_inplace(read1); + revcomp_inplace(read1); } if (p.r2.rc()) { flag2 |= BAM_FREVERSE; flag1 |= BAM_FMREVERSE; - if (strict_sam) - revcomp_inplace(read2); + revcomp_inplace(read2); } if (allow_ambig && ambig) { // ADS: mark ambig for both the same way? @@ -927,13 +923,15 @@ struct se_map_stats { } [[nodiscard]] std::string - tostring(const std::size_t n_tabs = 0) const { + tostring(const std::string &label, const std::size_t n_tabs = 0) const { static constexpr auto tab = " "; constexpr auto pct = [](const double x) { return x * 100.0; }; std::string t; for (std::size_t i = 0; i < n_tabs; ++i) t += tab; std::ostringstream oss; + oss << t << label << ":\n"; + t += tab; // clang-format off oss << t << "total_reads: " << total_reads << '\n' << t << "mapped:\n" @@ -985,10 +983,10 @@ struct pe_map_stats { [[nodiscard]] std::string tostring(const bool allow_ambig) const { std::ostringstream oss; - oss << "pairs:\n" << both_stats.tostring(1); + oss << both_stats.tostring("pairs"); if (!allow_ambig) { - oss << "mate1:\n" << end1_stats.tostring(1); - oss << "mate2:\n" << end2_stats.tostring(1); + oss << end1_stats.tostring("read1"); + oss << end2_stats.tostring("read2"); } return oss.str(); } @@ -996,24 +994,23 @@ struct pe_map_stats { static void select_output( - const bool allow_ambig, const bool strict_sam, const ChromLookup &cl, + const bool allow_ambig, const ChromLookup &cl, // ADS: 'read1' and 'read2' should not be used after a call to 'select_output' std::string &read1, const std::string &name1, std::string &read2, const std::string &name2, const bam_cigar_t &cig1, const bam_cigar_t &cig2, pe_element &best, se_element &se1, se_element &se2, bam_rec &sr1, bam_rec &sr2) { - const map_type pe_map_type = - format_pe(allow_ambig, strict_sam, best, cl, read1, read2, name1, name2, - cig1, cig2, sr1, sr2); + const map_type pe_map_type = format_pe(allow_ambig, best, cl, read1, read2, + name1, name2, cig1, cig2, sr1, sr2); if (!best.should_report(allow_ambig) || pe_map_type == map_unmapped) { if (pe_map_type == map_unmapped) best.reset(); - if (format_se(allow_ambig, strict_sam, se1, cl, read1, name1, cig1, sr1) == + if (format_se(allow_ambig, se1, cl, read1, name1, cig1, sr1) == map_unmapped) se1.reset(); - if (format_se(allow_ambig, strict_sam, se2, cl, read2, name2, cig2, sr2) == + if (format_se(allow_ambig, se2, cl, read2, name2, cig2, sr2) == map_unmapped) se2.reset(); } @@ -1114,8 +1111,8 @@ find_candidates(const std::uint32_t max_candidates, low = the_bit ? first_1 : low; } - // some bit narrows it down to 0 candidates, roll back to when we - // had some candidates to work with. + // some bit narrows it down to 0 candidates, roll back to when we had some + // candidates to work with. if (low == high) { --p; low = prev_low; @@ -1179,8 +1176,8 @@ find_candidates_three(const std::uint32_t max_candidates, } } - // some bit narrows it down to 0 candidates, roll back to when we - // had some candidates to work with. + // some bit narrows it down to 0 candidates, roll back to when we had some + // candidates to work with. if (low == high) { --p; low = prev_low; @@ -1276,9 +1273,9 @@ process_seeds(const std::uint32_t max_candidates, const std::uint32_t lim_two = readlen - seed::key_weight + 1; - // GS: this is to avoid chasing down uninformative two-letter - // seeds when there is a sufficiently high number of three - // letter seeds that is lower than the number of two-letter hits + // GS: this is to avoid chasing down uninformative two-letter seeds when + // there is a sufficiently high number of three letter seeds that is lower + // than the number of two-letter hits static const std::uint32_t MIN_FOLD_SIZE = 10; for (i = 0; i < lim_two && !res.sure_ambig; ++i, ++read_idx) { s_idx = index_st + *(counter_st + k); @@ -1315,12 +1312,11 @@ prep_read(const std::string &r, Read &pread) { : (encode_base_t_rich[static_cast(r[i])])); } -/* GS: this function simply converts the vector pread - * to a vector by putting 16 bases in each element of - * the packed read. If the read length does not divide 16, we add - * 1111s to the remaining positions so it divides 16. The remaining - * bases match all bases in the reference genome - * */ +/* GS: this function simply converts the vector pread to a + * vector by putting 16 bases in each element of the packed read. If + * the read length does not divide 16, we add 1111s to the remaining positions + * so it divides 16. The remaining bases match all bases in the reference + * genome */ static void pack_read(const Read &pread, PackedRead &packed_pread) { static const element_t base_match_any = static_cast(0xF); @@ -1345,8 +1341,8 @@ pack_read(const Read &pread, PackedRead &packed_pread) { if (pread_ind == sz) return; - // now put only the remaining bases in the last pos. The rest - // should match any base in the reference + // now put only the remaining bases in the last pos. The rest should match + // any base in the reference *it = 0; std::size_t j = 0; while (pread_ind < sz) @@ -1443,8 +1439,8 @@ reset_bam_rec(bam_rec &b) { template static void map_single_ended(const bool show_progress, const bool allow_ambig, - const bool strict_sam, const AbismalIndex &abismal_index, - ReadLoader &rl, se_map_stats &se_stats, bamxx::bam_header &hdr, + const AbismalIndex &abismal_index, ReadLoader &rl, + se_map_stats &se_stats, bamxx::bam_header &hdr, bamxx::bam_out &out, ProgressBar &progress) { const auto counter_st(std::cbegin(abismal_index.counter)); const auto counter_t_st(std::cbegin(abismal_index.counter_t)); @@ -1519,8 +1515,8 @@ map_single_ended(const bool show_progress, const bool allow_ambig, align_se_candidates(pread, pread_rc, pread, pread_rc, se_element::valid_frac, res, bests[i], cigar[i], aln); - if (format_se(allow_ambig, strict_sam, bests[i], abismal_index.cl, - reads[i], names[i], cigar[i], mr[i]) == map_unmapped) + if (format_se(allow_ambig, bests[i], abismal_index.cl, reads[i], + names[i], cigar[i], mr[i]) == map_unmapped) bests[i].reset(); } } @@ -1547,10 +1543,9 @@ map_single_ended(const bool show_progress, const bool allow_ambig, static void map_single_ended_rand(const bool show_progress, const bool allow_ambig, - const bool strict_sam, const AbismalIndex &abismal_index, - ReadLoader &rl, se_map_stats &se_stats, - bamxx::bam_header &hdr, bamxx::bam_out &out, - ProgressBar &progress) { + const AbismalIndex &abismal_index, ReadLoader &rl, + se_map_stats &se_stats, bamxx::bam_header &hdr, + bamxx::bam_out &out, ProgressBar &progress) { const auto counter_st(std::cbegin(abismal_index.counter)); const auto counter_t_st(std::cbegin(abismal_index.counter_t)); const auto counter_a_st(std::cbegin(abismal_index.counter_a)); @@ -1575,8 +1570,7 @@ map_single_ended_rand(const bool show_progress, const bool allow_ambig, bests.resize(ReadLoader::batch_size); mr.resize(ReadLoader::batch_size); - // GS: pre-allocated variables used once per read - // and not used for reporting + // GS: pre-allocated variables used once per read and not used for reporting Read pread_t, pread_t_rc, pread_a, pread_a_rc; PackedRead packed_pread; se_candidates res; @@ -1633,8 +1627,8 @@ map_single_ended_rand(const bool show_progress, const bool allow_ambig, align_se_candidates(pread_t, pread_t_rc, pread_a, pread_a_rc, se_element::valid_frac, res, bests[i], cigar[i], aln); - if (format_se(allow_ambig, strict_sam, bests[i], abismal_index.cl, - reads[i], names[i], cigar[i], mr[i]) == map_unmapped) + if (format_se(allow_ambig, bests[i], abismal_index.cl, reads[i], + names[i], cigar[i], mr[i]) == map_unmapped) bests[i].reset(); } } @@ -1670,7 +1664,7 @@ format_duration(const abismal_timepoint start_time, template static void run_single_ended(const bool show_progress, const bool allow_ambig, - const bool strict_sam, const std::string &reads_file, + const std::string &reads_file, const AbismalIndex &abismal_index, se_map_stats &se_stats, bamxx::bam_header &hdr, bamxx::bam_out &out) { ReadLoader rl(reads_file); @@ -1682,11 +1676,11 @@ run_single_ended(const bool show_progress, const bool allow_ambig, for (auto i = 0u; i < abismal_concurrency::n_threads; ++i) threads.emplace_back([&] { if (random_pbat) - map_single_ended_rand(show_progress, allow_ambig, strict_sam, - abismal_index, rl, se_stats, hdr, out, progress); + map_single_ended_rand(show_progress, allow_ambig, abismal_index, rl, + se_stats, hdr, out, progress); else - map_single_ended(show_progress, allow_ambig, strict_sam, - abismal_index, rl, se_stats, hdr, out, progress); + map_single_ended(show_progress, allow_ambig, abismal_index, rl, + se_stats, hdr, out, progress); }); for (auto &thread : threads) thread.join(); @@ -1809,7 +1803,7 @@ best_pair(const pe_candidates &res1, const pe_candidates &res2, s2.pos = best_pos2; s2.diffs = simple_aln::edit_distance(best_scr2, len2, cigar2); - // last check if, after alignment, mates are still concordant + // last check if, after alignment, ends are still concordant const std::uint32_t frag_end = best_pos2 + len2; if (frag_end >= best_pos1 + pe_element::min_dist && frag_end <= best_pos1 + pe_element::max_dist) { @@ -1883,8 +1877,8 @@ map_fragments(const std::uint32_t max_candidates, const std::string &read1, template static void map_paired_ended(const bool show_progress, const bool allow_ambig, - const bool strict_sam, const AbismalIndex &abismal_index, - ReadLoader &rl1, ReadLoader &rl2, pe_map_stats &pe_stats, + const AbismalIndex &abismal_index, ReadLoader &rl1, + ReadLoader &rl2, pe_map_stats &pe_stats, bamxx::bam_header &hdr, bamxx::bam_out &out, ProgressBar &progress) { const auto counter_st(std::begin(abismal_index.counter)); @@ -2016,9 +2010,9 @@ map_paired_ended(const bool show_progress, const bool allow_ambig, se_element::valid_frac / 2.0, res_se2, bests_se2[i], cigar2[i], aln); } - select_output(allow_ambig, strict_sam, abismal_index.cl, reads1[i], - names1[i], reads2[i], names2[i], cigar1[i], cigar2[i], - bests[i], bests_se1[i], bests_se2[i], mr1[i], mr2[i]); + select_output(allow_ambig, abismal_index.cl, reads1[i], names1[i], + reads2[i], names2[i], cigar1[i], cigar2[i], bests[i], + bests_se1[i], bests_se2[i], mr1[i], mr2[i]); } { @@ -2050,8 +2044,8 @@ map_paired_ended(const bool show_progress, const bool allow_ambig, static void map_paired_ended_rand(const bool show_progress, const bool allow_ambig, - const bool strict_sam, const AbismalIndex &abismal_index, - ReadLoader &rl1, ReadLoader &rl2, pe_map_stats &pe_stats, + const AbismalIndex &abismal_index, ReadLoader &rl1, + ReadLoader &rl2, pe_map_stats &pe_stats, bamxx::bam_header &hdr, bamxx::bam_out &out, ProgressBar &progress) { const auto counter_st(std::begin(abismal_index.counter)); @@ -2193,9 +2187,9 @@ map_paired_ended_rand(const bool show_progress, const bool allow_ambig, se_element::valid_frac / 2.0, res_se2, bests_se2[i], cigar2[i], aln); } - select_output(allow_ambig, strict_sam, abismal_index.cl, reads1[i], - names1[i], reads2[i], names2[i], cigar1[i], cigar2[i], - bests[i], bests_se1[i], bests_se2[i], mr1[i], mr2[i]); + select_output(allow_ambig, abismal_index.cl, reads1[i], names1[i], + reads2[i], names2[i], cigar1[i], cigar2[i], bests[i], + bests_se1[i], bests_se2[i], mr1[i], mr2[i]); } { @@ -2228,8 +2222,7 @@ map_paired_ended_rand(const bool show_progress, const bool allow_ambig, template static void run_paired_ended(const bool show_progress, const bool allow_ambig, - const bool strict_sam, const std::string &reads_file1, - const std::string &reads_file2, + const std::string &reads_file1, const std::string &reads_file2, const AbismalIndex &abismal_index, pe_map_stats &pe_stats, bamxx::bam_header &hdr, bamxx::bam_out &out) { ReadLoader rl1(reads_file1); @@ -2242,13 +2235,11 @@ run_paired_ended(const bool show_progress, const bool allow_ambig, for (auto i = 0u; i < abismal_concurrency::n_threads; ++i) threads.emplace_back([&] { if (random_pbat) - map_paired_ended_rand(show_progress, allow_ambig, strict_sam, - abismal_index, rl1, rl2, pe_stats, hdr, out, - progress); + map_paired_ended_rand(show_progress, allow_ambig, abismal_index, rl1, + rl2, pe_stats, hdr, out, progress); else - map_paired_ended(show_progress, allow_ambig, strict_sam, - abismal_index, rl1, rl2, pe_stats, hdr, out, - progress); + map_paired_ended(show_progress, allow_ambig, abismal_index, rl1, + rl2, pe_stats, hdr, out, progress); }); for (auto &thread : threads) thread.join(); @@ -2271,8 +2262,8 @@ file_exists(const std::string &filename) { static int abismal_make_sam_header(const ChromLookup &cl, const int argc, char *argv[], bamxx::bam_header &hdr) { - assert(cl.names.size() > 2); // two entries exist for the padding - assert(cl.starts.size() == cl.names.size() + 1); + assert(std::size(cl.names) > 2); // two entries exist for the padding + assert(std::size(cl.starts) == std::size(cl.names) + 1); const std::vector names(std::begin(cl.names) + 1, std::end(cl.names) - 1); std::vector sizes(names.size()); @@ -2298,8 +2289,8 @@ abismal_make_sam_header(const ChromLookup &cl, const int argc, char *argv[], // how the program was run std::ostringstream the_command; - copy(argv, argv + argc, - std::ostream_iterator(the_command, " ")); + std::copy(argv, argv + argc, + std::ostream_iterator(the_command, " ")); out << "CL:\"" << the_command.str() << "\"" << '\n'; hdr.h = sam_hdr_init(); @@ -2321,7 +2312,6 @@ abismal(int argc, char *argv[]) { bool pbat_mode = false; bool random_pbat = false; bool write_bam_fmt = false; - bool strict_sam = false; std::uint32_t max_candidates = 0; std::string index_file = ""; @@ -2358,9 +2348,6 @@ abismal(int argc, char *argv[]) { false, GA_conversion); opt_parse.add_opt("threads", 't', "number of threads", false, abismal_concurrency::n_threads); - opt_parse.add_opt("sam-orientation", 'S', - "reverse complement negative strand mappers", false, - strict_sam); opt_parse.add_opt("verbose", 'v', "print more run info", false, verbose); std::vector leftover_args; opt_parse.parse(argc, argv, leftover_args); @@ -2491,41 +2478,37 @@ abismal(int argc, char *argv[]) { if (reads_file2.empty()) { if (GA_conversion || pbat_mode) - run_single_ended(show_progress, allow_ambig, strict_sam, - reads_file, abismal_index, se_stats, - hdr, out); + run_single_ended(show_progress, allow_ambig, reads_file, + abismal_index, se_stats, hdr, out); else if (random_pbat) - run_single_ended(show_progress, allow_ambig, strict_sam, - reads_file, abismal_index, se_stats, hdr, - out); + run_single_ended(show_progress, allow_ambig, reads_file, + abismal_index, se_stats, hdr, out); else - run_single_ended(show_progress, allow_ambig, strict_sam, - reads_file, abismal_index, se_stats, - hdr, out); + run_single_ended(show_progress, allow_ambig, reads_file, + abismal_index, se_stats, hdr, out); } else { if (pbat_mode) - run_paired_ended(show_progress, allow_ambig, strict_sam, - reads_file, reads_file2, abismal_index, - pe_stats, hdr, out); + run_paired_ended(show_progress, allow_ambig, reads_file, + reads_file2, abismal_index, pe_stats, + hdr, out); else if (random_pbat) - run_paired_ended(show_progress, allow_ambig, strict_sam, - reads_file, reads_file2, abismal_index, - pe_stats, hdr, out); + run_paired_ended(show_progress, allow_ambig, reads_file, + reads_file2, abismal_index, pe_stats, + hdr, out); else - run_paired_ended(show_progress, allow_ambig, strict_sam, - reads_file, reads_file2, abismal_index, - pe_stats, hdr, out); + run_paired_ended(show_progress, allow_ambig, reads_file, + reads_file2, abismal_index, pe_stats, + hdr, out); } if (!stats_outfile.empty()) { std::ofstream stats_of(stats_outfile); if (stats_of) - stats_of << (reads_file2.empty() ? se_stats.tostring() + stats_of << (reads_file2.empty() ? se_stats.tostring("read1") : pe_stats.tostring(allow_ambig)); else - std::cerr << "failed to open stats output file: " << stats_outfile - << '\n'; + std::cerr << "failed to open stats out file: " << stats_outfile << '\n'; } } catch (const std::exception &e) {