00001 #ifndef LUX_INDEX_INVERTEDINDEXCORE_H
00002 #define LUX_INDEX_INVERTEDINDEXCORE_H
00003
00004 #include "lux/lux.h"
00005 #include "posting.h"
00006 #include "vbc.h"
00007 #include "lux/storage/storage_engine.h"
00008 #include "lux/document/document_definition.pb.h"
00009 #include "lux/document/document.h"
00010 #include "lux/lexer/term.h"
00011 #include "lux/scoped_ptr.h"
00012
00013 #include <iostream>
00014 #include <string>
00015 #include <set>
00016 #include <map>
00017 #include <functional>
00018 #include <algorithm>
00019
00020 #include <tr1/unordered_set>
00021
00022
00023 #ifdef HAVE_TR1_MEMORY
00024 #include <tr1/memory>
00025 #elif HAVE_BOOST_SHARED_PTR_HPP
00026 #include <boost/shared_ptr.hpp>
00027 #endif
00028
00029 namespace Lux {
00030
00031 const std::string INDEX_SUFFIX = ".idx";
00032
00033 typedef std::map< std::string, std::string > str_map;
00034 typedef str_map::iterator str_map_itr;
00035 typedef std::set< std::string > str_set;
00036 typedef str_set::iterator str_set_itr;
00037 typedef std::pair< str_set_itr, bool > str_set_result;
00038
00039 template <class T>
00040 struct less_index_to : public std::binary_function <T, T, bool> {
00041 bool operator() (const T& t1, const T& t2) const {
00042 return t1.index_to < t2.index_to;
00043 }
00044 };
00045
00046 template <class T>
00047 struct less_term : public std::binary_function <T, T, bool> {
00048 bool operator() (const T& t1, const T& t2) const {
00049 return t1.term.text_ < t2.term.text_;
00050 }
00051 };
00052
00053 struct TermBuffer {
00054 TermBuffer(Term term_)
00055 : term(term_)
00056 {}
00057 Term term;
00058 pos_list_t buffer;
00059 };
00060
00061 struct term_buffer_hash {
00062 std::tr1::hash<std::string> hs;
00063 size_t operator()(const TermBuffer& x) const {
00064 return static_cast<size_t>(hs(x.term.text_));
00065 }
00066 };
00067
00068 struct term_buffer_equal {
00069 bool operator()(const TermBuffer& x, const TermBuffer& y) const {
00070 return x.term.text_ == y.term.text_;
00071 }
00072 };
00073
00074 typedef std::tr1::unordered_set
00075 < TermBuffer,
00076 term_buffer_hash,
00077 term_buffer_equal
00078
00079 > TermBufferSet;
00080
00081
00082 typedef TermBufferSet::iterator TBIterator;
00083
00084 struct IndexEngine {
00085 IndexEngine(const std::string &index_to_)
00086 : index_to(index_to_),
00087 engine(new LuxBtreeStorage)
00088 {}
00089
00090 IndexEngine(const std::string &index_to_, bool is_for_comparison)
00091 : index_to(index_to_)
00092 {}
00093
00094 bool open(const std::string &index_file_, db_flags_t open_params_)
00095 {
00096 return engine->open(index_file_.c_str(), open_params_);
00097 }
00098
00099 bool close(void)
00100 {
00101 return engine->close();
00102 }
00103
00104 std::string index_to;
00105 #ifdef HAVE_TR1_MEMORY
00106 std::tr1::shared_ptr<LuxBtreeStorage> engine;
00107 #elif HAVE_BOOST_SHARED_PTR_HPP
00108 boost::shared_ptr<LuxBtreeStorage> engine;
00109 #endif
00110 };
00111
00112 typedef std::set< IndexEngine, less_index_to<IndexEngine> > IESet;
00113 typedef IESet::iterator IEIterator;
00114
00118 class IndexEngineSet {
00119 public:
00120 IndexEngineSet(Config::Document &doc_config)
00121 : doc_config_(doc_config)
00122 {}
00123
00124 bool open(std::string storage_dir, db_flags_t open_params)
00125 {
00126 for (int i = 0; i < doc_config_.field_size(); i++) {
00127 const Lux::Config::Field &field = doc_config_.field(i);
00128
00129 if (!field.index().indexing()) { continue; }
00130
00131 std::string index_to = field.index().index_to();
00132 str_set_result res = initialized_set_.insert(index_to);
00133 if (!res.second) { continue; }
00134
00135 std::string index_file = index_to + INDEX_SUFFIX;
00136 index_file.insert(0, storage_dir + "/");
00137 IndexEngine ie(index_to);
00138 if (!ie.open(index_file, open_params)) {
00139 return false;
00140 }
00141 ie_set_.insert(ie);
00142 }
00143 return true;
00144 }
00145
00146 bool close(void)
00147 {
00148
00149
00150 return true;
00151 }
00152
00153 void flush(const std::string &index_to, const TermBufferSet &term_buffer_set)
00154 {
00155
00156 IEIterator ie_itr = ie_set_.find(IndexEngine(index_to, true));
00157
00158
00159 TBIterator tb_itr = const_cast<TermBufferSet&>(term_buffer_set).begin();
00160 TBIterator tb_itr_end = const_cast<TermBufferSet&>(term_buffer_set).end();
00161 while (tb_itr != tb_itr_end) {
00162
00163 Term t = tb_itr->term;
00164 std::vector<uint8_t> bs = vb_encode(const_cast<pos_list_t&>(tb_itr->buffer));
00165 vb_encode_num(0, bs);
00166
00167 LuxDataUnit key(const_cast<char *>(t.text_.c_str()), t.text_.size());
00168
00169 LuxDataUnit val((uint8_t *) &(bs[0]), bs.size());
00170 if (!ie_itr->engine->append(key, val)) {
00171 error_log("append to inverted index failed.");
00172 throw std::runtime_error("append failed.");
00173 }
00174
00175
00176 pos_list_t().swap(const_cast<pos_list_t&>(tb_itr->buffer));
00177
00178
00179 ++tb_itr;
00180 }
00181 }
00182
00183 bool exist(const std::string &index_to)
00184 {
00185 IEIterator ie_itr = ie_set_.find(IndexEngine(index_to, true));
00186 if (ie_itr == ie_set_.end()) {
00187 return false;
00188 }
00189 return true;
00190 }
00191
00192 IndexEngine get(const std::string &index_to)
00193 {
00194 IEIterator ie_itr = ie_set_.find(IndexEngine(index_to, true));
00195 if (ie_itr == ie_set_.end()) {
00196 std::cerr << "fatal error: unexisted index " << index_to << std::endl;
00197 }
00198 return *ie_itr;
00199 }
00200
00201 private:
00202 IESet ie_set_;
00203 str_set initialized_set_;
00204 Config::Document doc_config_;
00205 };
00206
00210 struct IndexBuffer {
00211 IndexBuffer(std::string index_to_)
00212 : index_to(index_to_)
00213 {}
00214 std::string index_to;
00215 TermBufferSet term_buffer_set;
00216 };
00217 typedef std::set< IndexBuffer, less_index_to<IndexBuffer> > IBSet;
00218 typedef IBSet::iterator IBIterator;
00219
00223 class IndexBufferSet {
00224
00225 public:
00226 IndexBufferSet(Config::Document &doc_config)
00227 : curr_buffer_size_(0)
00228 {
00229 for (int i = 0; i < doc_config.field_size(); i++) {
00230 const Lux::Config::Field &field = doc_config.field(i);
00231
00232 if (!field.index().indexing()) { continue; }
00233
00234 IndexBuffer index_buffer(field.index().index_to());
00235
00236
00237 ib_set_.insert(index_buffer);
00238 }
00239 }
00240
00241
00242 void merge(std::string &index_to, TermBufferSet &term_buffer_set)
00243 {
00244 IBIterator ib_itr = ib_set_.find(IndexBuffer(index_to));
00245 TBIterator tb_itr_end = term_buffer_set.end();
00246 for (TBIterator tb_itr = term_buffer_set.begin();
00247 tb_itr != tb_itr_end; ++tb_itr) {
00248
00249 TBIterator tb_itr2 = const_cast<TermBufferSet&>(ib_itr->term_buffer_set).find(*tb_itr);
00250 if (tb_itr2 == ib_itr->term_buffer_set.end()) {
00251 const_cast<TermBufferSet&>(ib_itr->term_buffer_set).insert(*tb_itr);
00252 curr_buffer_size_ += tb_itr->buffer.size() * sizeof(uint32_t);
00253 curr_buffer_size_ += tb_itr->term.text_.size();
00254 } else {
00255 pos_list_itr itr = const_cast<pos_list_t&>(tb_itr->buffer).begin();
00256 pos_list_itr itr_end = const_cast<pos_list_t&>(tb_itr->buffer).end();
00257 for (; itr != itr_end; ++itr) {
00258 const_cast<pos_list_t&>(tb_itr2->buffer).push_back(*itr);
00259 }
00260 curr_buffer_size_ += tb_itr->buffer.size() * sizeof(uint32_t);
00261 }
00262 }
00263 }
00264
00265
00266 uint32_t get_buffer_size(void)
00267 {
00268 return curr_buffer_size_;
00269 }
00270
00271 void print()
00272 {
00273 IBIterator ib_itr = ib_set_.begin();
00274 while (ib_itr != ib_set_.end()) {
00275
00276 std::cout << "INDEX_TO: " << ib_itr->index_to << std::endl;
00277 TBIterator tb_itr = const_cast<TermBufferSet&>(ib_itr->term_buffer_set).begin();
00278 while (tb_itr != ib_itr->term_buffer_set.end()) {
00279 std::cout << "term [" << tb_itr->term.text_ << "] "
00280 << std::endl;
00281 pos_list_itr pl_itr = const_cast<pos_list_t&>(tb_itr->buffer).begin();
00282 std::cout << "posting list : ";
00283 for (; pl_itr != tb_itr->buffer.end(); ++pl_itr) {
00284 std::cout << *pl_itr << ",";
00285 }
00286 std::cout << std::endl;
00287 ++tb_itr;
00288 }
00289 ++ib_itr;
00290 }
00291 }
00292
00293 void flush(IndexEngineSet &index_engine_set)
00294 {
00295 IBIterator ib_itr = ib_set_.begin();
00296 while (ib_itr != ib_set_.end()) {
00297
00298 index_engine_set.flush(ib_itr->index_to, ib_itr->term_buffer_set);
00299 const_cast<TermBufferSet&>(ib_itr->term_buffer_set).clear();
00300 ++ib_itr;
00301 }
00302 curr_buffer_size_ = 0;
00303 }
00304
00305 private:
00306 IBSet ib_set_;
00307
00308
00309 uint32_t curr_buffer_size_;
00310 };
00311
00312
00316 struct IndexData {
00317 IndexData(std::string index_to_)
00318 : index_to(index_to_)
00319 {}
00320 std::string index_to;
00321 std::string data;
00322 bool is_exact;
00323 };
00324 typedef std::set< IndexData, less_index_to<IndexData> > IDSet;
00325 typedef IDSet::iterator IDIterator;
00326
00330 class IndexDataSet {
00331 public:
00335 IndexDataSet(Config::Document &doc_config)
00336 {
00337 for (int i = 0; i < doc_config.field_size(); i++) {
00338 const Lux::Config::Field &field = doc_config.field(i);
00339
00340 if (!field.index().indexing()) { continue; }
00341
00342 field_to_index_.insert(
00343 make_pair(field.name(), field.index().index_to())
00344 );
00345
00346 IndexData index_data(field.index().index_to());
00347 index_data.is_exact = field.index().has_exact();
00348
00349 id_set_.insert(index_data);
00350 }
00351 }
00352
00356 void divide(const Document &doc)
00357 {
00358 doc.init_iter();
00359 while (doc.has_next()) {
00360 const Lux::Field *f = doc.get_next();
00361 str_map_itr sm_itr = field_to_index_.find(f->get_name());
00362
00363 if (sm_itr == field_to_index_.end()) { continue; }
00364
00365 IDIterator id_itr = id_set_.find(IndexData(sm_itr->second));
00366 if (id_itr->data.empty()) {
00367 const_cast<std::string&>(id_itr->data) = f->get_value();
00368 } else {
00369
00370 const_cast<std::string&>(id_itr->data) += "" + f->get_value();
00371 }
00372 }
00373 }
00374
00375 void init_iter() { id_itr_ = id_set_.begin(); }
00376 bool has_next() { return id_itr_ != id_set_.end(); }
00377 IndexData get_next() { return *id_itr_++; }
00381 void clear_current_data() { const_cast<std::string&>(id_itr_->data) = ""; }
00382 void clear_data()
00383 {
00384 for (id_itr_ = id_set_.begin(); id_itr_ != id_set_.end(); ++id_itr_) {
00385 const_cast<std::string&>(id_itr_->data) = "";
00386 }
00387 }
00388
00389 private:
00390 IDSet id_set_;
00391 IDIterator id_itr_;
00392 str_map field_to_index_;
00393 };
00394
00395 }
00396
00397 #endif