00001 #ifndef LUX_INDEX_VBC_H
00002 #define LUX_INDEX_VBC_H
00003
00004 #include <vector>
00005 #include <deque>
00006 #include <stdint.h>
00007
00008
00009 #include <boost/pool/pool_alloc.hpp>
00010
00011 namespace Lux {
00012
00013
00014
00015
00016 typedef std::deque<uint32_t> ui32_vec_t;
00017
00018 static inline
00019 void vb_encode_num(int n, std::vector<uint8_t> &bytestream)
00020 {
00021 std::vector<uint8_t> bytes;
00022 bytes.reserve(8);
00023 while (true) {
00024 unsigned char rem = n % 128;
00025 bytes.push_back(rem);
00026 if (n < 128) break;
00027 n /= 128;
00028 }
00029 bytes[0] += 128;
00030 size_t size = bytes.size();
00031 for (int i = size - 1; i >= 0; --i) {
00032 bytestream.push_back(bytes[i]);
00033 }
00034 }
00035
00036 static inline
00037 std::vector<uint8_t> vb_encode_nodiff(std::vector<uint32_t> &numbers)
00038 {
00039 std::vector<uint8_t> bytestream;
00040 std::vector<uint32_t>::iterator itr = numbers.begin();
00041 std::vector<uint32_t>::iterator itr_end = numbers.end();
00042 for (; itr != itr_end; ++itr) {
00043 vb_encode_num(*itr, bytestream);
00044 }
00045 return bytestream;
00046 }
00047
00048 static inline
00049 std::vector<uint8_t> vb_encode_docdiff(std::vector<uint32_t> &numbers)
00050 {
00051 std::vector<uint8_t> bytestream;
00052 std::vector<uint32_t>::iterator itr = numbers.begin();
00053 std::vector<uint32_t>::iterator itr_end = numbers.end();
00054 int num_postings = 0;
00055
00056 for (; itr != itr_end; ++itr) {
00057 if (num_postings == 0) {
00058 vb_encode_num(*itr, bytestream);
00059 num_postings = *(++itr);
00060 vb_encode_num(num_postings, bytestream);
00061 } else {
00062 int prev_pos = 0;
00063 while (itr != itr_end) {
00064 vb_encode_num(*itr-prev_pos, bytestream);
00065 prev_pos = *itr;
00066 if (--num_postings > 0) {
00067 ++itr;
00068 } else {
00069 break;
00070 }
00071 }
00072 }
00073 }
00074 return bytestream;
00075 }
00076
00077
00078 static inline
00079
00080 std::vector<uint8_t> vb_encode(ui32_vec_t &numbers)
00081 {
00082 std::vector<uint8_t> bytestream;
00083
00084 ui32_vec_t::iterator itr = numbers.begin();
00085
00086 ui32_vec_t::iterator itr_end = numbers.end();
00087 int prev_doc_id = 0;
00088 int doc_id_gap;
00089 int num_postings = 0;
00090
00091 int i = 0;
00092 for (; itr != itr_end; ++itr) {
00093 if (num_postings == 0) {
00094 ++i;
00095 doc_id_gap = *itr - prev_doc_id;
00096 prev_doc_id = *itr;
00097 vb_encode_num(doc_id_gap, bytestream);
00098 num_postings = *(++itr);
00099 vb_encode_num(num_postings, bytestream);
00100 } else {
00101 int prev_pos = 0;
00102 while (itr != itr_end) {
00103 vb_encode_num(*itr-prev_pos, bytestream);
00104 prev_pos = *itr;
00105 if (--num_postings > 0) {
00106 ++itr;
00107 } else {
00108 break;
00109 }
00110 }
00111 }
00112 }
00113 return bytestream;
00114 }
00115
00116 static inline
00117 std::vector<uint32_t> vb_decode_nodiff(uint8_t *bytestream, size_t size, bool skip_pos=false)
00118 {
00119 int n = 0;
00120 std::vector<uint32_t> array;
00121 array.reserve(5*size);
00122
00123 for (int i = 0; i < size; ++i) {
00124 n = n * 128 + bytestream[i];
00125 if (bytestream[i] >= 128) {
00126 array.push_back(n-128);
00127 if (skip_pos) {
00128 skip(bytestream, i , size, next(bytestream, ++i, size));
00129 }
00130 n = 0;
00131 }
00132 }
00133 return array;
00134 }
00135
00136 static inline
00137 std::vector<uint32_t> vb_decode_docdiff(uint8_t *bytestream, size_t size, bool skip_pos=false)
00138 {
00139 int n = 0;
00140 std::vector<uint32_t> array;
00141 array.reserve(5*size);
00142
00143 for (int i = 0; i < size; ++i) {
00144 n = n * 128 + bytestream[i];
00145 if (bytestream[i] >= 128) {
00146 array.push_back(n-128);
00147 n = 0;
00148 if (skip_pos) {
00149 skip(bytestream, i , size, next(bytestream, ++i, size));
00150 } else {
00151 uint32_t num_postings = next(bytestream, ++i, size);
00152 array.push_back(num_postings);
00153 uint32_t prev_pos = 0;
00154 for (int j = 0; j < num_postings && i < size; ++i) {
00155 n = n * 128 + bytestream[i];
00156 if (bytestream[i] >= 128) {
00157 array.push_back(n - 128 + prev_pos);
00158 prev_pos = n - 128 + prev_pos;
00159 n = 0;
00160 ++j;
00161 }
00162 }
00163 }
00164 --i;
00165 }
00166 }
00167 return array;
00168 }
00169
00170
00171 static inline
00172
00173 ui32_vec_t vb_decode(uint8_t *bytestream, size_t size, bool skip_pos=false)
00174 {
00175 int n = 0;
00176
00177 ui32_vec_t array;
00178
00179 uint32_t prev_doc_id = 0;
00180
00181 for (int i = 0; i < size; ++i) {
00182 if (bytestream[i] == 128 && n == 0) {
00183 prev_doc_id = 0;
00184 n = 0;
00185 continue;
00186 }
00187 n = n * 128 + bytestream[i];
00188 if (bytestream[i] >= 128) {
00189 array.push_back(n - 128 + prev_doc_id);
00190 prev_doc_id = n - 128 + prev_doc_id;
00191 n = 0;
00192 if (skip_pos) {
00193 skip(bytestream, i , size, next(bytestream, ++i, size));
00194 } else {
00195 uint32_t num_postings = next(bytestream, ++i, size);
00196 array.push_back(num_postings);
00197 uint32_t prev_pos = 0;
00198 for (int j = 0; j < num_postings && i < size; ++i) {
00199 n = n * 128 + bytestream[i];
00200 if (bytestream[i] >= 128) {
00201 array.push_back(n - 128 + prev_pos);
00202 prev_pos = n - 128 + prev_pos;
00203 n = 0;
00204 ++j;
00205 }
00206 }
00207 }
00208 --i;
00209 }
00210 }
00211 return array;
00212 }
00213 }
00214
00215 #endif