lux/index/posting.h

00001 #ifndef LUX_INDEX_POSTING_H
00002 #define LUX_INDEX_POSTING_H
00003 
00004 #include <vector>
00005 #include <deque>
00006 #include <iostream>
00007 #include <stdint.h>
00008 
00009 // test
00010 #include <boost/pool/pool_alloc.hpp>
00011 
00012 namespace Lux {
00013 
00014   //typedef std::vector< uint32_t,
00015   //                     boost::pool_allocator<uint32_t> > pos_list_t;
00016   //typedef std::vector<uint32_t> pos_list_t;
00017   typedef std::deque<uint32_t> pos_list_t;
00018   typedef pos_list_t::iterator pos_list_itr;
00019 
00020   struct Posting {
00021     Posting(void)
00022     {}
00023     Posting(pos_list_t _list, bool _skipped)
00024     : list(_list), skipped(_skipped)
00025     {}
00026 
00027     pos_list_t list;
00028     bool skipped;
00029   };
00030 
00031   struct PostingInf {
00032     PostingInf(uint32_t size_, uint32_t idx_)
00033     : size(size_), idx(idx_)
00034     {}
00035     uint32_t size;
00036     uint32_t idx;
00037   };
00038   typedef std::vector<PostingInf> PostingInfs;
00039   typedef PostingInfs::iterator PostingInfsItr;
00040 
00041   struct sort_by_size : public std::binary_function<PostingInf, PostingInf, bool> {
00042     bool operator()(const PostingInf &t1, const PostingInf &t2) {
00043       return t1.size < t2.size;
00044     }
00045   };
00046 
00052   // next number for uint8_t *
00053   static inline
00054   int32_t next(uint8_t *b, int &i, int size)
00055   {
00056     uint32_t n = 0;
00057     for (; i < size; ++i) {
00058       n = 128 * n + b[i];
00059       if (b[i] >= 128) {
00060         n -= 128;
00061         ++i;
00062         return n;
00063       }
00064     } 
00065     return -1;
00066   }
00067 
00068   static inline
00069   int32_t next(uint8_t *b, int &i, int size, uint32_t &prev)
00070   {
00071     uint32_t dp = next(b, i, size);
00072     if (dp > 0) {
00073       dp += prev;
00074     } else if (dp == 0) {
00075       do {
00076         dp = next(b, i, size);
00077       } while (dp == 0);
00078     }
00079     prev = dp;
00080     return dp;
00081   }
00082 
00083   static inline
00084   void skip(uint8_t *b, int &i, int size, int skip)
00085   {
00086     for (; i < size; ++i) {
00087       if (b[i] >= 128) {
00088         --skip;
00089         if (skip == 0) {
00090           ++i;
00091           return;
00092         }
00093       }
00094     } 
00095   }
00096 
00097   // next doc for pos_list_t
00098   static inline
00099   int32_t next(pos_list_itr *itr, pos_list_itr itr_end)
00100   {
00101     if (*itr != itr_end) {
00102       *itr += *(++*itr) + 1;
00103       if (*itr != itr_end) {
00104         return **itr;
00105       }
00106     }
00107     return -1;
00108   }
00109 
00110   // next position for pos_list_t
00111   static inline
00112   int32_t next(pos_list_itr *itr, int *n) {
00113     --*n;
00114     if (*n >= 0) {
00115       return *(++*itr);
00116     }
00117     return -1;
00118   }
00119 
00120   // intersection between uint8_t* arrays for postions
00121   static inline pos_list_t
00122   _intersect_nodiff(uint8_t *bp, int &ip, int sp,
00123              uint8_t *bn, int &in, int sn, int len)
00124   {
00125     pos_list_t pl;
00126     //pl.reserve(128); // for eliminating unnecessary copies
00127 
00128     int np = next(bp, ip, sp) - 1;
00129     int nn = next(bn, in, sn) - 1;
00130     int pp = next(bp, ip, sp);
00131     int pn = next(bn, in, sn);
00132 
00133     while (1) {
00134       if (pp + len == pn) {
00135         pl.push_back(pp);
00136         if (np > 0) { pp = next(bp, ip, sp); }
00137         if (nn > 0) { pn = next(bn, in, sn); }
00138         --nn;
00139         --np;
00140       } else if (pp + len > pn) {
00141         if (nn > 0) { pn = next(bn, in, sn); }
00142         --nn;
00143       } else {
00144         if (np > 0) { pp = next(bp, ip, sp); }
00145         --np;
00146       }
00147 
00148       if (np < 0 && nn < 0) break;
00149       if (np < 0 && pn > pp + len) {
00150         if (nn > 0) { skip(bn, in, sn, nn); }
00151         break;
00152       }
00153       if (nn < 0 && pp + len > pn) {
00154         if (np > 0) { skip(bp, ip, sp, np); }
00155         break;  
00156       }
00157     }
00158     return pl;
00159   }
00160 
00161   // intersection between uint8_t* arrays for postions
00162   static inline pos_list_t
00163   _intersect(uint8_t *bp, int &ip, int sp,
00164               uint8_t *bn, int &in, int sn, int len)
00165   {
00166     pos_list_t pl;
00167     //pl.reserve(128); // for eliminating unnecessary copies
00168 
00169     int np = next(bp, ip, sp) - 1;
00170     int nn = next(bn, in, sn) - 1;
00171     int pp = next(bp, ip, sp);
00172     int pn = next(bn, in, sn);
00173 
00174     while (1) {
00175       if (pp + len == pn) {
00176         pl.push_back(pp);
00177         if (np > 0) { pp += next(bp, ip, sp); }
00178         if (nn > 0) { pn += next(bn, in, sn); }
00179         --nn;
00180         --np;
00181       } else if (pp + len > pn) {
00182         if (nn > 0) { pn += next(bn, in, sn); }
00183         --nn;
00184       } else {
00185         if (np > 0) { pp += next(bp, ip, sp); }
00186         --np;
00187       }
00188 
00189       if (np < 0 && nn < 0) break;
00190       if (np < 0 && pn > pp + len) {
00191         if (nn > 0) { skip(bn, in, sn, nn); }
00192         break;
00193       }
00194       if (nn < 0 && pp + len > pn) {
00195         if (np > 0) { skip(bp, ip, sp, np); }
00196         break;  
00197       }
00198     }
00199     return pl;
00200   }
00201 
00202   // intersection between uint8_t* arrays for docid
00203   static inline 
00204   pos_list_t intersect_nodiff(uint8_t *bp, uint32_t sp,
00205                               uint8_t *bn, uint32_t sn, int len)
00206   {
00207     pos_list_t pl;
00208     if (sp == 0 || sn == 0) { return pl; }
00209     //pl.reserve(sp > sn ? sp : sn);
00210 
00211     int ip = 0, in = 0;
00212     int dp = next(bp, ip, sp);
00213     int dn = next(bn, in, sn);
00214 
00215     while (1) {
00216       if (dp == dn) {
00217         pos_list_t tp = _intersect_nodiff(bp, ip , sp, bn, in, sn, len);
00218         if (!tp.empty()) {
00219           pl.push_back(dp);
00220           pl.push_back(tp.size());
00221           pos_list_itr itr_end = tp.end();
00222           for (pos_list_itr itr = tp.begin();
00223                itr != itr_end; ++itr) {
00224             pl.push_back(*itr);
00225           }
00226         }
00227         dp = next(bp, ip, sp);
00228         dn = next(bn, in, sn);
00229       } else if (dp > dn) {
00230         skip(bn, in, sn, next(bn, in, sn));
00231         dn = next(bn, in, sn);
00232       } else {
00233         skip(bp, ip, sp, next(bp, ip, sp));
00234         dp = next(bp, ip, sp);
00235       }
00236 
00237       if ((ip == sp && in == sn) ||
00238           (ip == sp && dn > dp) || 
00239           (in == sn && dp > dn)) {
00240           break;
00241       }
00242     }
00243     return pl;
00244   }
00245 
00246   // intersection between uint8_t* arrays for docid
00247   static inline 
00248   pos_list_t intersect(uint8_t *bp, uint32_t sp,
00249                         uint8_t *bn, uint32_t sn, int len)
00250   {
00251     pos_list_t pl;
00252     if (sp == 0 || sn == 0) { return pl; }
00253     //pl.reserve(sp > sn ? sp : sn);
00254 
00255     int ip = 0, in = 0;
00256     uint32_t prev_dp = 0, prev_dn = 0;
00257     int dp = next(bp, ip, sp, prev_dp);
00258     int dn = next(bn, in, sn, prev_dn);
00259 
00260     while (1) {
00261       if (dp == dn) {
00262         pos_list_t tp = _intersect(bp, ip, sp, bn, in, sn, len);
00263         if (!tp.empty()) {
00264           pl.push_back(dp);
00265           pl.push_back(tp.size());
00266           pos_list_itr itr_end = tp.end();
00267           for (pos_list_itr itr = tp.begin();
00268                itr != itr_end; ++itr) {
00269             pl.push_back(*itr);
00270           }
00271         }
00272         dp = next(bp, ip, sp, prev_dp);
00273         dn = next(bn, in, sn, prev_dn);
00274       } else if (dp > dn) {
00275         skip(bn, in, sn, next(bn, in, sn));
00276         dn = next(bn, in, sn, prev_dn);
00277       } else {
00278         skip(bp, ip, sp, next(bp, ip, sp));
00279         dp = next(bp, ip, sp, prev_dp);
00280       }
00281 
00282       if ((ip == sp && in == sn) ||
00283           (ip == sp && dn > dp) || 
00284           (in == sn && dp > dn)) {
00285           break;
00286       }
00287     }
00288     return pl;
00289   }
00290 
00291 
00292   // intersection between uint8_t* array and pos_list_t for positions
00293   static inline pos_list_t
00294   _intersect_nodiff(uint8_t *bp, int &ip, int sp, pos_list_itr itr, int len)
00295   {
00296     pos_list_t pl;
00297     //pl.reserve(128); // for eliminating unnecessary copies
00298 
00299     int np = next(bp, ip, sp) - 1;
00300     int nn = *(++itr) - 1;
00301     int pp = next(bp, ip, sp);
00302     int pn = *(++itr);
00303 
00304     while (1) {
00305       if (pp + len == pn) {
00306         pl.push_back(pp);
00307         if (np > 0) { pp = next(bp, ip, sp); }
00308         pn = next(&itr, &nn);
00309         --np;
00310       } else if (pp + len > pn) {
00311         pn = next(&itr, &nn);
00312       } else {
00313         if (np > 0) { pp = next(bp, ip, sp); }
00314         --np;
00315       }
00316 
00317       if (np < 0 && nn < 0) break;
00318       if (np < 0 && pn > pp + len) {
00319         break;
00320       }
00321       if (nn < 0 && pp + len > pn) {
00322         if (np > 0) { skip(bp, ip, sp, np); }
00323         break;  
00324       }
00325     }
00326     return pl;
00327   }
00328 
00329   // intersection between uint8_t* array and pos_list_t for positions
00330   static inline pos_list_t
00331   _intersect(uint8_t *bp, int &ip, int sp, pos_list_itr itr, int len)
00332   {
00333     pos_list_t pl;
00334     //pl.reserve(128); // for eliminating unnecessary copies
00335 
00336     int np = next(bp, ip, sp) - 1;
00337     int nn = *(++itr) - 1;
00338     int pp = next(bp, ip, sp);
00339     int pn = *(++itr);
00340 
00341     while (1) {
00342       if (pp + len == pn) {
00343         pl.push_back(pp);
00344         if (np > 0) { pp += next(bp, ip, sp); }
00345         pn = next(&itr, &nn);
00346         --np;
00347       } else if (pp + len > pn) {
00348         pn = next(&itr, &nn);
00349       } else {
00350         if (np > 0) { pp += next(bp, ip, sp); }
00351         --np;
00352       }
00353 
00354       if (np < 0 && nn < 0) break;
00355       if (np < 0 && pn > pp + len) {
00356         break;
00357       }
00358       if (nn < 0 && pp + len > pn) {
00359         if (np > 0) { skip(bp, ip, sp, np); }
00360         break;  
00361       }
00362     }
00363     return pl;
00364   }
00365 
00366   // intersection between uint8_t* array and pos_list_t for docid
00367   static inline 
00368   pos_list_t intersect_nodiff(uint8_t *bp, uint32_t sp, pos_list_t &pn, int len)
00369   {
00370     pos_list_t pl;
00371     if (sp == 0 || pn.size() == 0) { return pl; }
00372     //pl.reserve(sp > pn.size() ? sp : pn.size());
00373 
00374     pos_list_itr itrn = pn.begin();
00375     pos_list_itr itrn_end = pn.end();
00376 
00377     int ip = 0;
00378     int dp = next(bp, ip, sp);
00379     int dn = *itrn;
00380 
00381     while (1) {
00382       if (dp == dn) {
00383         pos_list_t tp = _intersect_nodiff(bp, ip , sp, itrn, len);
00384         if (!tp.empty()) {
00385           pl.push_back(dp);
00386           pl.push_back(tp.size());
00387           pos_list_itr itr_end = tp.end();
00388           for (pos_list_itr itr = tp.begin();
00389                itr != itr_end; ++itr) {
00390             pl.push_back(*itr);
00391           }
00392         }
00393         dp = next(bp, ip, sp);
00394         dn = next(&itrn, itrn_end);
00395       } else if (dp > dn) {
00396         dn = next(&itrn, itrn_end);
00397       } else {
00398         skip(bp, ip, sp, next(bp, ip, sp));
00399         dp = next(bp, ip, sp);
00400       }
00401 
00402       if ((ip == sp && itrn == itrn_end) ||
00403           (ip == sp && dn > dp) || 
00404           (itrn == itrn_end && dp > dn)) {
00405           break;
00406       }
00407     }
00408     return pl;
00409   }
00410 
00411   // intersection between uint8_t* array and pos_list_t for docid
00412   static inline 
00413   pos_list_t intersect(uint8_t *bp, uint32_t sp, pos_list_t &pn, int len)
00414   {
00415     pos_list_t pl;
00416     if (sp == 0 || pn.size() == 0) { return pl; }
00417     //pl.reserve(sp > pn.size() ? sp : pn.size());
00418 
00419     pos_list_itr itrn = pn.begin();
00420     pos_list_itr itrn_end = pn.end();
00421 
00422     int ip = 0;
00423     uint32_t prev_dp = 0;
00424     int dp = next(bp, ip, sp, prev_dp);
00425     int dn = *itrn;
00426 
00427     while (1) {
00428       if (dp == dn) {
00429         pos_list_t tp = _intersect(bp, ip , sp, itrn, len);
00430         if (!tp.empty()) {
00431           pl.push_back(dp);
00432           pl.push_back(tp.size());
00433           pos_list_itr itr_end = tp.end();
00434           for (pos_list_itr itr = tp.begin();
00435                itr != itr_end; ++itr) {
00436             pl.push_back(*itr);
00437           }
00438         }
00439         dp = next(bp, ip, sp, prev_dp);
00440         dn = next(&itrn, itrn_end);
00441       } else if (dp > dn) {
00442         dn = next(&itrn, itrn_end);
00443       } else {
00444         skip(bp, ip, sp, next(bp, ip, sp));
00445         dp = next(bp, ip, sp, prev_dp);
00446       }
00447 
00448       if ((ip == sp && itrn == itrn_end) ||
00449           (ip == sp && dn > dp) || 
00450           (itrn == itrn_end && dp > dn)) {
00451           break;
00452       }
00453     }
00454     return pl;
00455   }
00456 
00457   // intersection between pos_list_t and uint8_t* array for positions
00458   static inline pos_list_t
00459   _intersect_nodiff(pos_list_itr itr, uint8_t *bn, int &in, int sn, int len)
00460   {
00461     pos_list_t pl;
00462     //pl.reserve(128); // for eliminating unnecessary copies
00463 
00464     int np = *(++itr) - 1;
00465     int nn = next(bn, in, sn) - 1;
00466     int pp = *(++itr);
00467     int pn = next(bn, in, sn);
00468 
00469     while (1) {
00470       if (pp + len == pn) {
00471         pl.push_back(pp);
00472         pp = next(&itr, &np);
00473         if (nn > 0) { pn = next(bn, in, sn); }
00474         --nn;
00475       } else if (pp + len > pn) {
00476         if (nn > 0) { pn = next(bn, in, sn); }
00477         --nn;
00478       } else {
00479         pp = next(&itr, &np);
00480       }
00481 
00482       if (np < 0 && nn < 0) break;
00483       if (np < 0 && pn > pp + len) {
00484         if (nn > 0) { skip(bn, in, sn, nn); }
00485         break;
00486       }
00487       if (nn < 0 && pp + len > pn) {
00488         break;  
00489       }
00490     }
00491     return pl;
00492   }
00493 
00494   // intersection between pos_list_t and uint8_t* array for positions
00495   static inline pos_list_t
00496   _intersect(pos_list_itr itr, uint8_t *bn, int &in, int sn, int len)
00497   {
00498     pos_list_t pl;
00499     //pl.reserve(128); // for eliminating unnecessary copies
00500 
00501     int np = *(++itr) - 1;
00502     int nn = next(bn, in, sn) - 1;
00503     int pp = *(++itr);
00504     int pn = next(bn, in, sn);
00505 
00506     while (1) {
00507       if (pp + len == pn) {
00508         pl.push_back(pp);
00509         pp = next(&itr, &np);
00510         if (nn > 0) { pn += next(bn, in, sn); }
00511         --nn;
00512       } else if (pp + len > pn) {
00513         if (nn > 0) { pn += next(bn, in, sn); }
00514         --nn;
00515       } else {
00516         pp = next(&itr, &np);
00517       }
00518 
00519       if (np < 0 && nn < 0) break;
00520       if (np < 0 && pn > pp + len) {
00521         if (nn > 0) { skip(bn, in, sn, nn); }
00522         break;
00523       }
00524       if (nn < 0 && pp + len > pn) {
00525         break;  
00526       }
00527     }
00528     return pl;
00529   }
00530 
00531   // intersection between pos_list_t and uint8_t* array for docid
00532   static inline 
00533   pos_list_t intersect_nodiff(pos_list_t &pp, uint8_t *bn, uint32_t sn, int len)
00534   {
00535     pos_list_t pl;
00536     if (pp.size() == 0 || sn == 0) { return pl; }
00537     //pl.reserve(pp.size() > sn ? pp.size() : sn);
00538 
00539     pos_list_itr itrp = pp.begin();
00540     pos_list_itr itrp_end = pp.end();
00541 
00542     int in = 0;
00543     int dp = *itrp;
00544     int dn = next(bn, in, sn);
00545 
00546     while (1) {
00547       if (dp == dn) {
00548         pos_list_t tp = _intersect_nodiff(itrp, bn, in , sn, len);
00549         if (!tp.empty()) {
00550           pl.push_back(dp);
00551           pl.push_back(tp.size());
00552           pos_list_itr itr_end = tp.end();
00553           for (pos_list_itr itr = tp.begin();
00554                itr != itr_end; ++itr) {
00555             pl.push_back(*itr);
00556           }
00557         }
00558         dp = next(&itrp, itrp_end);
00559         dn = next(bn, in, sn);
00560       } else if (dp > dn) {
00561         skip(bn, in, sn, next(bn, in, sn));
00562         dn = next(bn, in, sn);
00563       } else {
00564         dp = next(&itrp, itrp_end);
00565       }
00566 
00567       if ((itrp == itrp_end && in == sn) ||
00568           (itrp == itrp_end && dn > dp) || 
00569           (in == sn && dp > dn)) {
00570           break;
00571       }
00572     }
00573     return pl;
00574   }
00575 
00576   // intersection between pos_list_t and uint8_t* array for docid
00577   static inline 
00578   pos_list_t intersect(pos_list_t &pp, uint8_t *bn, uint32_t sn, int len)
00579   {
00580     pos_list_t pl;
00581     if (pp.size() == 0 || sn == 0) { return pl; }
00582     //pl.reserve(pp.size() > sn ? pp.size() : sn);
00583 
00584     pos_list_itr itrp = pp.begin();
00585     pos_list_itr itrp_end = pp.end();
00586 
00587     int in = 0;
00588     int dp = *itrp;
00589     uint32_t prev_dn = 0;
00590     int dn = next(bn, in, sn, prev_dn);
00591 
00592     while (1) {
00593       if (dp == dn) {
00594         pos_list_t tp = _intersect(itrp, bn, in , sn, len);
00595         if (!tp.empty()) {
00596           pl.push_back(dp);
00597           pl.push_back(tp.size());
00598           pos_list_itr itr_end = tp.end();
00599           for (pos_list_itr itr = tp.begin();
00600                itr != itr_end; ++itr) {
00601             pl.push_back(*itr);
00602           }
00603         }
00604         dp = next(&itrp, itrp_end);
00605         dn = next(bn, in, sn, prev_dn);
00606       } else if (dp > dn) {
00607         skip(bn, in, sn, next(bn, in, sn));
00608         dn = next(bn, in, sn, prev_dn);
00609       } else {
00610         dp = next(&itrp, itrp_end);
00611       }
00612 
00613       if ((itrp == itrp_end && in == sn) ||
00614           (itrp == itrp_end && dn > dp) || 
00615           (in == sn && dp > dn)) {
00616           break;
00617       }
00618     }
00619     return pl;
00620   }
00621 
00622 }
00623 
00624 #endif

Generated on Fri Feb 5 15:50:30 2010 for Lux by  doxygen 1.4.7