lux/index/vbc.h

00001 #ifndef LUX_INDEX_VBC_H
00002 #define LUX_INDEX_VBC_H
00003 
00004 #include <vector>
00005 #include <deque>
00006 #include <stdint.h>
00007 
00008 // test
00009 #include <boost/pool/pool_alloc.hpp>
00010 
00011 namespace Lux {
00012   
00013   //typedef std::vector< uint32_t,
00014   //                     boost::pool_allocator<uint32_t> > ui32_vec_t;
00015   //typedef std::vector<uint32_t> ui32_vec_t;
00016   typedef std::deque<uint32_t> ui32_vec_t;
00017 
00018   static inline
00019   void vb_encode_num(int n, std::vector<uint8_t> &bytestream)
00020   {
00021     std::vector<uint8_t> bytes;
00022     bytes.reserve(8);
00023     while (true) {
00024       unsigned char rem = n % 128;
00025       bytes.push_back(rem);
00026       if (n < 128) break;
00027       n /= 128;
00028     }
00029     bytes[0] += 128;
00030     size_t size = bytes.size();
00031     for (int i = size - 1; i >= 0; --i) {
00032       bytestream.push_back(bytes[i]); 
00033     }
00034   }
00035 
00036   static inline
00037   std::vector<uint8_t> vb_encode_nodiff(std::vector<uint32_t> &numbers)
00038   {
00039     std::vector<uint8_t> bytestream;
00040     std::vector<uint32_t>::iterator itr = numbers.begin();
00041     std::vector<uint32_t>::iterator itr_end = numbers.end();
00042     for (; itr != itr_end; ++itr) {
00043       vb_encode_num(*itr, bytestream);
00044     }
00045     return bytestream;
00046   }
00047 
00048   static inline
00049   std::vector<uint8_t> vb_encode_docdiff(std::vector<uint32_t> &numbers)
00050   {
00051     std::vector<uint8_t> bytestream;
00052     std::vector<uint32_t>::iterator itr = numbers.begin();
00053     std::vector<uint32_t>::iterator itr_end = numbers.end();
00054     int num_postings = 0;
00055 
00056     for (; itr != itr_end; ++itr) {
00057       if (num_postings == 0) {
00058         vb_encode_num(*itr, bytestream);
00059         num_postings = *(++itr);
00060         vb_encode_num(num_postings, bytestream);
00061       } else {
00062         int prev_pos = 0;
00063         while (itr != itr_end) {
00064           vb_encode_num(*itr-prev_pos, bytestream);
00065           prev_pos = *itr;
00066           if (--num_postings > 0) {
00067             ++itr;
00068           } else {
00069             break;
00070           }
00071         }
00072       }
00073     }
00074     return bytestream;
00075   }
00076 
00077   // handle both doc and position differences
00078   static inline
00079   //std::vector<uint8_t> vb_encode(std::vector<uint32_t> &numbers)
00080   std::vector<uint8_t> vb_encode(ui32_vec_t &numbers)
00081   {
00082     std::vector<uint8_t> bytestream;
00083     //std::vector<uint32_t>::iterator itr = numbers.begin();
00084     ui32_vec_t::iterator itr = numbers.begin();
00085     //std::vector<uint32_t>::iterator itr_end = numbers.end();
00086     ui32_vec_t::iterator itr_end = numbers.end();
00087     int prev_doc_id = 0;
00088     int doc_id_gap;
00089     int num_postings = 0;
00090 
00091     int i = 0;
00092     for (; itr != itr_end; ++itr) {
00093       if (num_postings == 0) {
00094         ++i;
00095         doc_id_gap = *itr - prev_doc_id;
00096         prev_doc_id = *itr;
00097         vb_encode_num(doc_id_gap, bytestream);
00098         num_postings = *(++itr);
00099         vb_encode_num(num_postings, bytestream);
00100       } else {
00101         int prev_pos = 0;
00102         while (itr != itr_end) {
00103           vb_encode_num(*itr-prev_pos, bytestream);
00104           prev_pos = *itr;
00105           if (--num_postings > 0) {
00106             ++itr;
00107           } else {
00108             break;
00109           }
00110         }
00111       }
00112     }
00113     return bytestream;
00114   }
00115 
00116   static inline
00117   std::vector<uint32_t> vb_decode_nodiff(uint8_t *bytestream, size_t size, bool skip_pos=false)
00118   {
00119     int n = 0;
00120     std::vector<uint32_t> array;
00121     array.reserve(5*size);
00122 
00123     for (int i = 0; i < size; ++i) {
00124       n = n * 128 + bytestream[i];
00125       if (bytestream[i] >= 128) {
00126         array.push_back(n-128);
00127         if (skip_pos) {
00128           skip(bytestream, i , size, next(bytestream, ++i, size));
00129         }
00130         n = 0;
00131       }
00132     }
00133     return array;
00134   }
00135 
00136   static inline
00137   std::vector<uint32_t> vb_decode_docdiff(uint8_t *bytestream, size_t size, bool skip_pos=false)
00138   {
00139     int n = 0;
00140     std::vector<uint32_t> array;
00141     array.reserve(5*size);
00142 
00143     for (int i = 0; i < size; ++i) {
00144       n = n * 128 + bytestream[i];
00145       if (bytestream[i] >= 128) {
00146         array.push_back(n-128);
00147         n = 0;
00148         if (skip_pos) {
00149           skip(bytestream, i , size, next(bytestream, ++i, size));
00150         } else {
00151           uint32_t num_postings = next(bytestream, ++i, size);
00152           array.push_back(num_postings);
00153           uint32_t prev_pos = 0;
00154           for (int j = 0; j < num_postings && i < size; ++i) {
00155             n = n * 128 + bytestream[i];
00156             if (bytestream[i] >= 128) {
00157               array.push_back(n - 128 + prev_pos);
00158               prev_pos = n - 128 + prev_pos;
00159               n = 0;
00160               ++j;
00161             }
00162           }
00163         }
00164         --i;
00165       }
00166     }
00167     return array;
00168   }
00169 
00170   // handle both doc and position differences
00171   static inline
00172   //std::vector<uint32_t> vb_decode(uint8_t *bytestream, size_t size, bool skip_pos=false)
00173   ui32_vec_t vb_decode(uint8_t *bytestream, size_t size, bool skip_pos=false)
00174   {
00175     int n = 0;
00176     //std::vector<uint32_t> array;
00177     ui32_vec_t array;
00178     //array.reserve(5*size);
00179     uint32_t prev_doc_id = 0;
00180 
00181     for (int i = 0; i < size; ++i) {
00182       if (bytestream[i] == 128 && n == 0) {
00183         prev_doc_id = 0;
00184         n = 0;
00185         continue;
00186       }
00187       n = n * 128 + bytestream[i];
00188       if (bytestream[i] >= 128) {
00189         array.push_back(n - 128 + prev_doc_id);
00190         prev_doc_id = n - 128 + prev_doc_id;
00191         n = 0;
00192         if (skip_pos) {
00193           skip(bytestream, i , size, next(bytestream, ++i, size));
00194         } else {
00195           uint32_t num_postings = next(bytestream, ++i, size);
00196           array.push_back(num_postings);
00197           uint32_t prev_pos = 0;
00198           for (int j = 0; j < num_postings && i < size; ++i) {
00199             n = n * 128 + bytestream[i];
00200             if (bytestream[i] >= 128) {
00201               array.push_back(n - 128 + prev_pos);
00202               prev_pos = n - 128 + prev_pos;
00203               n = 0;
00204               ++j;
00205             }
00206           }
00207         }
00208         --i;
00209       }
00210     }
00211     return array;
00212   }
00213 }
00214 
00215 #endif

Generated on Fri Feb 5 15:50:30 2010 for Lux by  doxygen 1.4.7