lux/index/inverted_index_core.h

00001 #ifndef LUX_INDEX_INVERTEDINDEXCORE_H
00002 #define LUX_INDEX_INVERTEDINDEXCORE_H
00003 
00004 #include "lux/lux.h"
00005 #include "posting.h"
00006 #include "vbc.h"
00007 #include "lux/storage/storage_engine.h"
00008 #include "lux/document/document_definition.pb.h"
00009 #include "lux/document/document.h"
00010 #include "lux/lexer/term.h"
00011 #include "lux/scoped_ptr.h"
00012 
00013 #include <iostream>
00014 #include <string>
00015 #include <set>
00016 #include <map>
00017 #include <functional>
00018 #include <algorithm>
00019 
00020 #include <tr1/unordered_set>
00021 //#include <boost/pool/pool_alloc.hpp>
00022 
00023 #ifdef HAVE_TR1_MEMORY
00024 #include <tr1/memory>
00025 #elif HAVE_BOOST_SHARED_PTR_HPP
00026 #include <boost/shared_ptr.hpp>
00027 #endif
00028 
00029 namespace Lux {
00030 
00031   const std::string INDEX_SUFFIX = ".idx";
00032   
00033   typedef std::map< std::string, std::string > str_map;
00034   typedef str_map::iterator str_map_itr;
00035   typedef std::set< std::string > str_set;
00036   typedef str_set::iterator str_set_itr;
00037   typedef std::pair< str_set_itr, bool > str_set_result;
00038 
00039   template <class T>    
00040   struct less_index_to : public std::binary_function <T, T, bool> {
00041     bool operator() (const T& t1, const T& t2) const {
00042       return t1.index_to < t2.index_to;
00043     }
00044   };
00045 
00046   template <class T>    
00047   struct less_term : public std::binary_function <T, T, bool> {
00048     bool operator() (const T& t1, const T& t2) const {
00049       return t1.term.text_ < t2.term.text_;
00050     }
00051   };
00052 
00053   struct TermBuffer {
00054     TermBuffer(Term term_)
00055     : term(term_)
00056     {}
00057     Term term;
00058     pos_list_t buffer;
00059   };
00060 
00061   struct term_buffer_hash {
00062     std::tr1::hash<std::string> hs;
00063     size_t operator()(const TermBuffer& x) const {
00064       return static_cast<size_t>(hs(x.term.text_));
00065     }
00066   };
00067 
00068   struct term_buffer_equal {
00069     bool operator()(const TermBuffer& x, const TermBuffer& y) const {
00070       return x.term.text_ == y.term.text_;
00071     }
00072   };
00073 
00074   typedef std::tr1::unordered_set
00075     < TermBuffer,
00076       term_buffer_hash,
00077       term_buffer_equal
00078       //boost::pool_allocator<TermBuffer>
00079     > TermBufferSet;
00080 
00081   //typedef std::set< TermBuffer, less_term<TermBuffer> > TermBufferSet;
00082   typedef TermBufferSet::iterator TBIterator;
00083 
00084   struct IndexEngine {
00085     IndexEngine(const std::string &index_to_)
00086     : index_to(index_to_),
00087       engine(new LuxBtreeStorage)
00088     {}
00089 
00090     IndexEngine(const std::string &index_to_, bool is_for_comparison)
00091     : index_to(index_to_)
00092     {}
00093 
00094     bool open(const std::string &index_file_, db_flags_t open_params_)
00095     {
00096       return engine->open(index_file_.c_str(), open_params_);
00097     }
00098 
00099     bool close(void)
00100     {
00101       return engine->close();
00102     }
00103 
00104     std::string index_to;
00105 #ifdef HAVE_TR1_MEMORY
00106     std::tr1::shared_ptr<LuxBtreeStorage> engine;
00107 #elif HAVE_BOOST_SHARED_PTR_HPP
00108     boost::shared_ptr<LuxBtreeStorage> engine;
00109 #endif
00110   };
00111 
00112   typedef std::set< IndexEngine, less_index_to<IndexEngine> > IESet;
00113   typedef IESet::iterator IEIterator;
00114 
00118   class IndexEngineSet {
00119   public:
00120     IndexEngineSet(Config::Document &doc_config)
00121     : doc_config_(doc_config)
00122     {}
00123 
00124     bool open(std::string storage_dir, db_flags_t open_params)
00125     {
00126       for (int i = 0; i < doc_config_.field_size(); i++) {
00127         const Lux::Config::Field &field = doc_config_.field(i);
00128 
00129         if (!field.index().indexing()) { continue; }
00130 
00131         std::string index_to = field.index().index_to();
00132         str_set_result res = initialized_set_.insert(index_to);
00133         if (!res.second) { continue; }
00134 
00135         std::string index_file = index_to + INDEX_SUFFIX;
00136         index_file.insert(0, storage_dir + "/");
00137         IndexEngine ie(index_to);
00138         if (!ie.open(index_file, open_params)) {
00139           return false;
00140         }
00141         ie_set_.insert(ie);
00142       }
00143       return true;
00144     }
00145 
00146     bool close(void)
00147     {
00148       // all the engines will be closed automatically in destructor,
00149       // so this method does nothing for now.
00150       return true;
00151     }
00152 
00153     void flush(const std::string &index_to, const TermBufferSet &term_buffer_set)
00154     {
00155       // select engine
00156       IEIterator ie_itr = ie_set_.find(IndexEngine(index_to, true));
00157 
00158       //ie_itr->engine
00159       TBIterator tb_itr = const_cast<TermBufferSet&>(term_buffer_set).begin();
00160       TBIterator tb_itr_end = const_cast<TermBufferSet&>(term_buffer_set).end(); 
00161       while (tb_itr != tb_itr_end) {
00162 
00163         Term t = tb_itr->term;
00164         std::vector<uint8_t> bs = vb_encode(const_cast<pos_list_t&>(tb_itr->buffer));
00165         vb_encode_num(0, bs); // mark
00166         
00167         LuxDataUnit key(const_cast<char *>(t.text_.c_str()), t.text_.size());
00168         //LuxDataUnit val(intarray, intarray_size);
00169         LuxDataUnit val((uint8_t *) &(bs[0]), bs.size());
00170         if (!ie_itr->engine->append(key, val)) {
00171           error_log("append to inverted index failed.");
00172           throw std::runtime_error("append failed.");
00173         }
00174 
00175         // clear and minimize idiom
00176         pos_list_t().swap(const_cast<pos_list_t&>(tb_itr->buffer));
00177 
00178         //delete [] intarray;
00179         ++tb_itr;
00180       }
00181     }
00182 
00183     bool exist(const std::string &index_to)
00184     {
00185       IEIterator ie_itr = ie_set_.find(IndexEngine(index_to, true));
00186       if (ie_itr == ie_set_.end()) {
00187         return false;
00188       }
00189       return true;
00190     }
00191 
00192     IndexEngine get(const std::string &index_to)
00193     {
00194       IEIterator ie_itr = ie_set_.find(IndexEngine(index_to, true));
00195       if (ie_itr == ie_set_.end()) {
00196         std::cerr << "fatal error: unexisted index " << index_to << std::endl;
00197       }
00198       return *ie_itr;
00199     }
00200     
00201   private:
00202     IESet ie_set_;
00203     str_set initialized_set_;
00204     Config::Document doc_config_;
00205   };
00206 
00210   struct IndexBuffer {
00211     IndexBuffer(std::string index_to_)
00212     : index_to(index_to_)
00213     {}
00214     std::string index_to;
00215     TermBufferSet term_buffer_set;
00216   };
00217   typedef std::set< IndexBuffer, less_index_to<IndexBuffer> > IBSet;
00218   typedef IBSet::iterator IBIterator;
00219 
00223   class IndexBufferSet {
00224 
00225   public:
00226     IndexBufferSet(Config::Document &doc_config)
00227     : curr_buffer_size_(0)
00228     {
00229       for (int i = 0; i < doc_config.field_size(); i++) {
00230         const Lux::Config::Field &field = doc_config.field(i);
00231 
00232         if (!field.index().indexing()) { continue; }
00233 
00234         IndexBuffer index_buffer(field.index().index_to());
00235         // IndexData with the same index_to will fail,
00236         // so no duplicate checking
00237         ib_set_.insert(index_buffer);
00238       }
00239     }
00240 
00241     // merge with TermBufferSet for index_to
00242     void merge(std::string &index_to, TermBufferSet &term_buffer_set)
00243     {
00244       IBIterator ib_itr = ib_set_.find(IndexBuffer(index_to));
00245       TBIterator tb_itr_end = term_buffer_set.end();
00246       for (TBIterator tb_itr = term_buffer_set.begin();
00247            tb_itr != tb_itr_end; ++tb_itr) {
00248 
00249         TBIterator tb_itr2 = const_cast<TermBufferSet&>(ib_itr->term_buffer_set).find(*tb_itr);
00250         if (tb_itr2 == ib_itr->term_buffer_set.end()) {
00251           const_cast<TermBufferSet&>(ib_itr->term_buffer_set).insert(*tb_itr);
00252           curr_buffer_size_ += tb_itr->buffer.size() * sizeof(uint32_t);
00253           curr_buffer_size_ += tb_itr->term.text_.size();
00254         } else {
00255           pos_list_itr itr = const_cast<pos_list_t&>(tb_itr->buffer).begin();
00256           pos_list_itr itr_end = const_cast<pos_list_t&>(tb_itr->buffer).end();
00257           for (; itr != itr_end; ++itr) {
00258             const_cast<pos_list_t&>(tb_itr2->buffer).push_back(*itr);
00259           }
00260           curr_buffer_size_ += tb_itr->buffer.size() * sizeof(uint32_t);
00261         }
00262       }
00263     }
00264 
00265     // it's not exact memory use for index buffer
00266     uint32_t get_buffer_size(void)
00267     {
00268       return curr_buffer_size_;
00269     }
00270 
00271     void print()
00272     {
00273       IBIterator ib_itr = ib_set_.begin(); 
00274       while (ib_itr != ib_set_.end()) {
00275         
00276         std::cout << "INDEX_TO: " << ib_itr->index_to << std::endl;
00277         TBIterator tb_itr = const_cast<TermBufferSet&>(ib_itr->term_buffer_set).begin(); 
00278         while (tb_itr != ib_itr->term_buffer_set.end()) {
00279           std::cout << "term [" << tb_itr->term.text_ << "] "
00280                     << std::endl;
00281           pos_list_itr pl_itr = const_cast<pos_list_t&>(tb_itr->buffer).begin();
00282           std::cout << "posting list : ";
00283           for (; pl_itr != tb_itr->buffer.end(); ++pl_itr) {
00284             std::cout << *pl_itr << ",";
00285           }
00286           std::cout << std::endl;
00287           ++tb_itr;
00288         }
00289         ++ib_itr;
00290       }
00291     }
00292 
00293     void flush(IndexEngineSet &index_engine_set)
00294     {
00295       IBIterator ib_itr = ib_set_.begin(); 
00296       while (ib_itr != ib_set_.end()) {
00297 
00298         index_engine_set.flush(ib_itr->index_to, ib_itr->term_buffer_set);
00299         const_cast<TermBufferSet&>(ib_itr->term_buffer_set).clear();
00300         ++ib_itr;
00301       }
00302       curr_buffer_size_ = 0;
00303     }
00304 
00305   private:
00306     IBSet ib_set_;
00307     // used as a threshold for flushing,
00308     // so it's not exact memory use for index buffer.
00309     uint32_t curr_buffer_size_; 
00310   };
00311 
00312 
00316   struct IndexData {
00317     IndexData(std::string index_to_)
00318     : index_to(index_to_)
00319     {}
00320     std::string index_to;
00321     std::string data;
00322     bool is_exact;
00323   };
00324   typedef std::set< IndexData, less_index_to<IndexData> > IDSet;
00325   typedef IDSet::iterator IDIterator;
00326 
00330   class IndexDataSet {
00331   public:
00335     IndexDataSet(Config::Document &doc_config)
00336     {
00337       for (int i = 0; i < doc_config.field_size(); i++) {
00338         const Lux::Config::Field &field = doc_config.field(i);
00339 
00340         if (!field.index().indexing()) { continue; }
00341 
00342         field_to_index_.insert(
00343             make_pair(field.name(), field.index().index_to())
00344           );
00345 
00346         IndexData index_data(field.index().index_to());
00347         index_data.is_exact = field.index().has_exact();
00348         // IndexData with the same index_to will fail
00349         id_set_.insert(index_data);
00350       }
00351     }
00352 
00356     void divide(const Document &doc)
00357     {
00358       doc.init_iter();
00359       while (doc.has_next()) {
00360         const Lux::Field *f = doc.get_next();
00361         str_map_itr sm_itr = field_to_index_.find(f->get_name());
00362         // this field is not indexed
00363         if (sm_itr == field_to_index_.end()) { continue; }
00364 
00365         IDIterator id_itr = id_set_.find(IndexData(sm_itr->second));
00366         if (id_itr->data.empty()) {
00367           const_cast<std::string&>(id_itr->data) = f->get_value(); 
00368         } else {
00369           // data for the same index is appended
00370           const_cast<std::string&>(id_itr->data) += "" + f->get_value(); 
00371         }
00372       }
00373     }
00374 
00375     void init_iter() { id_itr_ = id_set_.begin(); }
00376     bool has_next() { return id_itr_ != id_set_.end(); }
00377     IndexData get_next() { return *id_itr_++; }
00381     void clear_current_data() { const_cast<std::string&>(id_itr_->data) = ""; }
00382     void clear_data()
00383     {
00384       for (id_itr_ = id_set_.begin(); id_itr_ != id_set_.end(); ++id_itr_) {
00385         const_cast<std::string&>(id_itr_->data) = "";
00386       }
00387     }
00388 
00389   private:
00390     IDSet id_set_;
00391     IDIterator id_itr_;
00392     str_map field_to_index_;
00393   };
00394 
00395 }
00396 
00397 #endif

Generated on Fri Feb 5 15:50:30 2010 for Lux by  doxygen 1.4.7