42 #include <unordered_map> 59 <<
"BitMagic DNA Search Sample (c) 2018" << std::endl
60 <<
"-fa file-name -- input FASTA file" << std::endl
61 <<
"-s hi|lo -- run substring search benchmark" << std::endl
62 <<
"-diag -- run diagnostics" << std::endl
63 <<
"-timing -- collect timings" << std::endl
82 for (
int i = 1; i < argc; ++i)
84 std::string arg = argv[i];
85 if ((arg ==
"-h") || (arg ==
"--help"))
91 if (arg ==
"-fa" || arg ==
"--fa")
99 std::cerr <<
"Error: -fa requires file name" << std::endl;
105 if (arg ==
"-diag" || arg ==
"--diag" || arg ==
"-d" || arg ==
"--d")
107 if (arg ==
"-timing" || arg ==
"--timing" || arg ==
"-t" || arg ==
"--t")
109 if (arg ==
"-bench" || arg ==
"--bench" || arg ==
"-b" || arg ==
"--b")
111 if (arg ==
"-search" || arg ==
"--search" || arg ==
"-s" || arg ==
"--s")
116 std::string a = argv[i+1];
119 if (a ==
"l" || a ==
"lo")
125 if (a ==
"h" || a ==
"hi")
142 typedef std::vector<std::pair<unsigned, std::string> >
dict_vect;
152 int load_FASTA(
const std::string& fname, std::vector<char>& seq_vect)
157 std::ifstream fin(fname.c_str(), std::ios::in);
162 for (
unsigned i = 0; std::getline(fin, line); ++i)
168 for (std::string::iterator it = line.begin(); it != line.end(); ++it)
169 seq_vect.push_back(*it);
183 enum { eA = 0, eC, eG,
eT, eN, eEnd };
189 void Build(
const vector<char>& sequence)
199 for (
size_t i = 0; i < sequence.size(); ++i)
201 unsigned pos = unsigned(i);
231 return m_FPrintBV[eA];
233 return m_FPrintBV[eC];
235 return m_FPrintBV[eG];
237 return m_FPrintBV[eT];
239 return m_FPrintBV[eN];
243 throw runtime_error(
"Error. Invalid letter!");
250 void Find(
const string& word, vector<unsigned>& res)
257 for (
size_t i = 1; i < word.size(); ++i)
270 unsigned ws = unsigned(word.size()) - 1;
271 TranslateResults(bv, ws, res);
283 for (
size_t i = 0; i < word.size(); ++i)
292 m_Agg.combine_shift_right_and(bv);
295 unsigned ws = unsigned(word.size()) - 1;
296 TranslateResults(bv, ws, res);
303 vector<vector<unsigned>>& hits)
305 vector<unique_ptr<aggregator_type> > agg_pipeline;
308 for (
const auto& w : words)
313 const string& word = get<0>(w);
314 for (
size_t i = 0; i < word.size(); ++i)
317 agg_ptr->add(&bv_mask);
320 agg_pipeline.emplace_back(agg_ptr.release());
321 ws = unsigned(word.size()) - 1;
326 vector<unique_ptr<aggregator_type> >::iterator>(agg_pipeline.begin(), agg_pipeline.end());
329 for (
size_t i = 0; i < agg_pipeline.size(); ++i)
331 const aggregator_type* agg_ptr = agg_pipeline[i].get();
333 vector<unsigned> res;
335 TranslateResults(*bv, ws, res);
336 hits.emplace_back(res);
346 vector<unsigned>& res)
349 for (; en.
valid(); ++en)
352 res.push_back(pos - left_shift);
368 vector<tuple<string,int>>& lo_words,
369 const vector<char>& data,
373 cout <<
"k-mer generation... " << endl;
378 if (data.size() < word_size)
381 size_t end_pos = data.size() - word_size;
383 map<string, int> words;
386 string s(&data[i], word_size);
387 if (s.find(
'N') == string::npos)
392 cout <<
"\r" << i <<
"/" << end_pos << flush;
396 cout << endl <<
"Picking k-mer samples..." << flush;
398 multimap<int,string, greater<int>> dst;
399 for_each(words.begin(), words.end(), [&](
const std::pair<string,int>& p)
401 dst.emplace(p.second, p.first);
404 auto it = dst.begin();
405 for(
size_t count = 0; count < N && it !=dst.end(); ++it,++count)
406 top_words.emplace_back(it->second, it->first);
410 auto it = dst.rbegin();
411 for(
size_t count = 0; count < N && it !=dst.rend(); ++it, ++count)
412 lo_words.emplace_back(it->second, it->first);
415 cout <<
"OK" << endl;
422 const char* word,
unsigned word_size,
425 if (data.size() < word_size)
429 size_t end_pos = data.size() - word_size;
433 for (
size_t j = i, k = 0, l = word_size - 1; l > k; ++j, ++k, --l)
435 if (data[j] != word[k] || data[i + l] != word[l])
442 r.push_back(
unsigned(i));
452 vector<const char*> words,
454 vector<vector<unsigned>>& hits)
456 if (data.size() < word_size)
460 size_t end_pos = data.size() - word_size;
461 size_t words_size = words.size();
464 for (
size_t idx = 0; idx < words_size; ++idx)
466 auto& word = words[idx];
468 for (
size_t j = i, k = 0, l = word_size - 1; l > k; ++j, ++k, --l)
470 if (data[j] != word[k] || data[i + l] != word[l])
478 hits[idx].push_back(
unsigned(i));
492 if (h1.size() != h2.size())
494 cerr <<
"size1 = " << h1.size() <<
" size2 = " << h2.size() << endl;
497 for (
size_t i = 0; i < h1.size(); ++i)
508 int main(
int argc,
char *argv[])
516 std::vector<char> seq_vect;
531 std::cout <<
"FASTA sequence size=" << seq_vect.size() << std::endl;
542 vector<tuple<string,int> > h_words;
543 vector<tuple<string,int> > l_words;
545 vector<tuple<string,int>>& words =
h_word_set ? h_words : l_words;
552 vector<THitList> word_hits;
553 vector<THitList> word_hits_agg;
561 vector<const char*> word_list;
562 for (
const auto& w : words)
564 word_list.push_back(get<0>(w).c_str());
566 word_hits.resize(words.size());
567 for_each(word_hits.begin(), word_hits.end(), [](
THitList& ht) {
587 for (
size_t word_idx = 0; word_idx < words.size(); ++ word_idx)
589 auto& word = get<0>(words[word_idx]);
595 word.c_str(), unsigned(word.size()),
601 idx.
Find(word, hits2);
613 cout <<
"Mismatch ERROR for: " << word << endl;
619 cout <<
"Sigle pass mismatch ERROR for: " << word << endl;
623 cout << word_idx <<
": " << word <<
": " << hits1.size() <<
" hits " << endl;
631 std::cout << std::endl <<
"Performance:" << std::endl;
635 catch (std::exception& ex)
637 std::cerr <<
"Error:" << ex.what() << std::endl;
Output iterator iterator designed to set "ON" bits based on input sequence of integers.
Compressed bit-vector bvector<> container, set algebraic methods, traversal iterators.
void aggregator_pipeline_execute(It first, It last)
Experimental method ro run multiple aggregators in sync.
void Find(const string &word, vector< unsigned > &res)
Find word strings using shift + and on fingerprint vectors (horizontal, non-fused basic method) ...
void Build(const vector< char > &sequence)
Build fingerprint bit-vectors from the original sequence.
bm::chrono_taker::duration_map_type timing_map
static bool hitlist_compare(const THitList &h1, const THitList &h2)
Check search result match.
Timing utilities for benchmarking (internal)
static void find_words(const vector< char > &data, vector< const char *> words, unsigned word_size, vector< vector< unsigned >> &hits)
Find all words in one pass (cache coherent algorithm) (variation of 2-way string matching for collect...
static void find_word_2way(vector< char > &data, const char *word, unsigned word_size, THitList &r)
2-way string matching
Utility for keeping all DNA finger print vectors and search using various techniques.
Algorithms for bvector<> (main include)
void FindCollection(const vector< tuple< string, int > > &words, vector< vector< unsigned >> &hits)
Find a set of words in one pass using pipeline of aggregators (this is very experimental) ...
static int load_FASTA(const std::string &fname, std::vector< char > &seq_vect)
Algorithms for fast aggregation of N bvectors.
void TranslateResults(const bm::bvector<> &bv, unsigned left_shift, vector< unsigned > &res)
Translate search results vector using (word size) left shift.
enumerator first() const
Returns enumerator pointing on the first non-zero bit.
input set is sorted (ascending order)
std::map< std::string, statistics > duration_map_type
test name to duration map
const bvector_type * get_target() const
static void print_duration_map(const duration_map_type &dmap, format fmt=ct_time)
bool valid() const
Checks if iterator is still valid.
Utility class to collect performance measurements and statistics.
bool any() const
Returns true if any bits in this bitset are set, and otherwise returns false.
Algorithms for fast aggregation of a group of bit-vectors.
static void generate_kmers(vector< tuple< string, int >> &top_words, vector< tuple< string, int >> &lo_words, const vector< char > &data, size_t N, unsigned word_size)
generate the most frequent words of specified length from the input sequence
Serialization / compression of bvector<>. Set theoretical operations on compressed BLOBs...
static int parse_args(int argc, char *argv[])
std::map< std::string, unsigned > freq_map
Constant iterator designed to enumerate "ON" bits.
void FindAggFused(const string &word, vector< unsigned > &res)
This method uses cache blocked aggregator with fused SHIFT+AND kernel.
int main(int argc, char *argv[])
bm::aggregator< bm::bvector<> > aggregator_type
const bm::bvector & GetVector(char letter) const
Return fingerprint bit-vector.
bool shift_right()
Shift right by 1 bit, fill with zero return carry out.
vector< unsigned > THitList
std::vector< std::pair< unsigned, std::string > > dict_vect
static const size_t WORD_SIZE