lsh_table.h 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525
  1. /***********************************************************************
  2. * Software License Agreement (BSD License)
  3. *
  4. * Copyright 2008-2009 Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
  5. * Copyright 2008-2009 David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
  6. *
  7. * THE BSD LICENSE
  8. *
  9. * Redistribution and use in source and binary forms, with or without
  10. * modification, are permitted provided that the following conditions
  11. * are met:
  12. *
  13. * 1. Redistributions of source code must retain the above copyright
  14. * notice, this list of conditions and the following disclaimer.
  15. * 2. Redistributions in binary form must reproduce the above copyright
  16. * notice, this list of conditions and the following disclaimer in the
  17. * documentation and/or other materials provided with the distribution.
  18. *
  19. * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  20. * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  21. * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  22. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  23. * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  24. * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  25. * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  26. * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  27. * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  28. * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  29. *************************************************************************/
  30. /***********************************************************************
  31. * Author: Vincent Rabaud
  32. *************************************************************************/
  33. #ifndef OPENCV_FLANN_LSH_TABLE_H_
  34. #define OPENCV_FLANN_LSH_TABLE_H_
  35. //! @cond IGNORED
  36. #include <algorithm>
  37. #include <iostream>
  38. #include <iomanip>
  39. #include <limits.h>
  40. // TODO as soon as we use C++0x, use the code in USE_UNORDERED_MAP
  41. #ifdef __GXX_EXPERIMENTAL_CXX0X__
  42. # define USE_UNORDERED_MAP 1
  43. #else
  44. # define USE_UNORDERED_MAP 0
  45. #endif
  46. #if USE_UNORDERED_MAP
  47. #include <unordered_map>
  48. #else
  49. #include <map>
  50. #endif
  51. #include <math.h>
  52. #include <stddef.h>
  53. #include "dynamic_bitset.h"
  54. #include "matrix.h"
  55. #ifdef _MSC_VER
  56. #pragma warning(push)
  57. #pragma warning(disable: 4702) //disable unreachable code
  58. #endif
  59. namespace cvflann
  60. {
  61. namespace lsh
  62. {
  63. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  64. /** What is stored in an LSH bucket
  65. */
  66. typedef uint32_t FeatureIndex;
  67. /** The id from which we can get a bucket back in an LSH table
  68. */
  69. typedef unsigned int BucketKey;
  70. /** A bucket in an LSH table
  71. */
  72. typedef std::vector<FeatureIndex> Bucket;
  73. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  74. /** POD for stats about an LSH table
  75. */
  76. struct LshStats
  77. {
  78. std::vector<unsigned int> bucket_sizes_;
  79. size_t n_buckets_;
  80. size_t bucket_size_mean_;
  81. size_t bucket_size_median_;
  82. size_t bucket_size_min_;
  83. size_t bucket_size_max_;
  84. size_t bucket_size_std_dev;
  85. /** Each contained vector contains three value: beginning/end for interval, number of elements in the bin
  86. */
  87. std::vector<std::vector<unsigned int> > size_histogram_;
  88. };
  89. /** Overload the << operator for LshStats
  90. * @param out the streams
  91. * @param stats the stats to display
  92. * @return the streams
  93. */
  94. inline std::ostream& operator <<(std::ostream& out, const LshStats& stats)
  95. {
  96. int w = 20;
  97. out << "Lsh Table Stats:\n" << std::setw(w) << std::setiosflags(std::ios::right) << "N buckets : "
  98. << stats.n_buckets_ << "\n" << std::setw(w) << std::setiosflags(std::ios::right) << "mean size : "
  99. << std::setiosflags(std::ios::left) << stats.bucket_size_mean_ << "\n" << std::setw(w)
  100. << std::setiosflags(std::ios::right) << "median size : " << stats.bucket_size_median_ << "\n" << std::setw(w)
  101. << std::setiosflags(std::ios::right) << "min size : " << std::setiosflags(std::ios::left)
  102. << stats.bucket_size_min_ << "\n" << std::setw(w) << std::setiosflags(std::ios::right) << "max size : "
  103. << std::setiosflags(std::ios::left) << stats.bucket_size_max_;
  104. // Display the histogram
  105. out << std::endl << std::setw(w) << std::setiosflags(std::ios::right) << "histogram : "
  106. << std::setiosflags(std::ios::left);
  107. for (std::vector<std::vector<unsigned int> >::const_iterator iterator = stats.size_histogram_.begin(), end =
  108. stats.size_histogram_.end(); iterator != end; ++iterator) out << (*iterator)[0] << "-" << (*iterator)[1] << ": " << (*iterator)[2] << ", ";
  109. return out;
  110. }
  111. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  112. /** Lsh hash table. As its key is a sub-feature, and as usually
  113. * the size of it is pretty small, we keep it as a continuous memory array.
  114. * The value is an index in the corpus of features (we keep it as an unsigned
  115. * int for pure memory reasons, it could be a size_t)
  116. */
  117. template<typename ElementType>
  118. class LshTable
  119. {
  120. public:
  121. /** A container of all the feature indices. Optimized for space
  122. */
  123. #if USE_UNORDERED_MAP
  124. typedef std::unordered_map<BucketKey, Bucket> BucketsSpace;
  125. #else
  126. typedef std::map<BucketKey, Bucket> BucketsSpace;
  127. #endif
  128. /** A container of all the feature indices. Optimized for speed
  129. */
  130. typedef std::vector<Bucket> BucketsSpeed;
  131. /** Default constructor
  132. */
  133. LshTable()
  134. {
  135. key_size_ = 0;
  136. feature_size_ = 0;
  137. speed_level_ = kArray;
  138. }
  139. /** Default constructor
  140. * Create the mask and allocate the memory
  141. * @param feature_size is the size of the feature (considered as a ElementType[])
  142. * @param key_size is the number of bits that are turned on in the feature
  143. */
  144. LshTable(unsigned int feature_size, unsigned int key_size)
  145. {
  146. feature_size_ = feature_size;
  147. CV_UNUSED(key_size);
  148. CV_Error(cv::Error::StsUnsupportedFormat, "LSH is not implemented for that type" );
  149. }
  150. /** Add a feature to the table
  151. * @param value the value to store for that feature
  152. * @param feature the feature itself
  153. */
  154. void add(unsigned int value, const ElementType* feature)
  155. {
  156. // Add the value to the corresponding bucket
  157. BucketKey key = (lsh::BucketKey)getKey(feature);
  158. switch (speed_level_) {
  159. case kArray:
  160. // That means we get the buckets from an array
  161. buckets_speed_[key].push_back(value);
  162. break;
  163. case kBitsetHash:
  164. // That means we can check the bitset for the presence of a key
  165. key_bitset_.set(key);
  166. buckets_space_[key].push_back(value);
  167. break;
  168. case kHash:
  169. {
  170. // That means we have to check for the hash table for the presence of a key
  171. buckets_space_[key].push_back(value);
  172. break;
  173. }
  174. }
  175. }
  176. /** Add a set of features to the table
  177. * @param dataset the values to store
  178. */
  179. void add(Matrix<ElementType> dataset)
  180. {
  181. #if USE_UNORDERED_MAP
  182. buckets_space_.rehash((buckets_space_.size() + dataset.rows) * 1.2);
  183. #endif
  184. // Add the features to the table
  185. for (unsigned int i = 0; i < dataset.rows; ++i) add(i, dataset[i]);
  186. // Now that the table is full, optimize it for speed/space
  187. optimize();
  188. }
  189. /** Get a bucket given the key
  190. * @param key
  191. * @return
  192. */
  193. inline const Bucket* getBucketFromKey(BucketKey key) const
  194. {
  195. // Generate other buckets
  196. switch (speed_level_) {
  197. case kArray:
  198. // That means we get the buckets from an array
  199. return &buckets_speed_[key];
  200. break;
  201. case kBitsetHash:
  202. // That means we can check the bitset for the presence of a key
  203. if (key_bitset_.test(key)) return &buckets_space_.find(key)->second;
  204. else return 0;
  205. break;
  206. case kHash:
  207. {
  208. // That means we have to check for the hash table for the presence of a key
  209. BucketsSpace::const_iterator bucket_it, bucket_end = buckets_space_.end();
  210. bucket_it = buckets_space_.find(key);
  211. // Stop here if that bucket does not exist
  212. if (bucket_it == bucket_end) return 0;
  213. else return &bucket_it->second;
  214. break;
  215. }
  216. }
  217. return 0;
  218. }
  219. /** Compute the sub-signature of a feature
  220. */
  221. size_t getKey(const ElementType* /*feature*/) const
  222. {
  223. CV_Error(cv::Error::StsUnsupportedFormat, "LSH is not implemented for that type" );
  224. return 0;
  225. }
  226. /** Get statistics about the table
  227. * @return
  228. */
  229. LshStats getStats() const;
  230. private:
  231. /** defines the speed fo the implementation
  232. * kArray uses a vector for storing data
  233. * kBitsetHash uses a hash map but checks for the validity of a key with a bitset
  234. * kHash uses a hash map only
  235. */
  236. enum SpeedLevel
  237. {
  238. kArray, kBitsetHash, kHash
  239. };
  240. /** Initialize some variables
  241. */
  242. void initialize(size_t key_size)
  243. {
  244. const size_t key_size_lower_bound = 1;
  245. //a value (size_t(1) << key_size) must fit the size_t type so key_size has to be strictly less than size of size_t
  246. const size_t key_size_upper_bound = (std::min)(sizeof(BucketKey) * CHAR_BIT + 1, sizeof(size_t) * CHAR_BIT);
  247. if (key_size < key_size_lower_bound || key_size >= key_size_upper_bound)
  248. {
  249. CV_Error(cv::Error::StsBadArg, cv::format("Invalid key_size (=%d). Valid values for your system are %d <= key_size < %d.", (int)key_size, (int)key_size_lower_bound, (int)key_size_upper_bound));
  250. }
  251. speed_level_ = kHash;
  252. key_size_ = (unsigned)key_size;
  253. }
  254. /** Optimize the table for speed/space
  255. */
  256. void optimize()
  257. {
  258. // If we are already using the fast storage, no need to do anything
  259. if (speed_level_ == kArray) return;
  260. // Use an array if it will be more than half full
  261. if (buckets_space_.size() > ((size_t(1) << key_size_) / 2)) {
  262. speed_level_ = kArray;
  263. // Fill the array version of it
  264. buckets_speed_.resize(size_t(1) << key_size_);
  265. for (BucketsSpace::const_iterator key_bucket = buckets_space_.begin(); key_bucket != buckets_space_.end(); ++key_bucket) buckets_speed_[key_bucket->first] = key_bucket->second;
  266. // Empty the hash table
  267. buckets_space_.clear();
  268. return;
  269. }
  270. // If the bitset is going to use less than 10% of the RAM of the hash map (at least 1 size_t for the key and two
  271. // for the vector) or less than 512MB (key_size_ <= 30)
  272. if (((std::max(buckets_space_.size(), buckets_speed_.size()) * CHAR_BIT * 3 * sizeof(BucketKey)) / 10
  273. >= (size_t(1) << key_size_)) || (key_size_ <= 32)) {
  274. speed_level_ = kBitsetHash;
  275. key_bitset_.resize(size_t(1) << key_size_);
  276. key_bitset_.reset();
  277. // Try with the BucketsSpace
  278. for (BucketsSpace::const_iterator key_bucket = buckets_space_.begin(); key_bucket != buckets_space_.end(); ++key_bucket) key_bitset_.set(key_bucket->first);
  279. }
  280. else {
  281. speed_level_ = kHash;
  282. key_bitset_.clear();
  283. }
  284. }
  285. /** The vector of all the buckets if they are held for speed
  286. */
  287. BucketsSpeed buckets_speed_;
  288. /** The hash table of all the buckets in case we cannot use the speed version
  289. */
  290. BucketsSpace buckets_space_;
  291. /** What is used to store the data */
  292. SpeedLevel speed_level_;
  293. /** If the subkey is small enough, it will keep track of which subkeys are set through that bitset
  294. * That is just a speedup so that we don't look in the hash table (which can be mush slower that checking a bitset)
  295. */
  296. DynamicBitset key_bitset_;
  297. /** The size of the sub-signature in bits
  298. */
  299. unsigned int key_size_;
  300. unsigned int feature_size_;
  301. // Members only used for the unsigned char specialization
  302. /** The mask to apply to a feature to get the hash key
  303. * Only used in the unsigned char case
  304. */
  305. std::vector<size_t> mask_;
  306. };
  307. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  308. // Specialization for unsigned char
  309. template<>
  310. inline LshTable<unsigned char>::LshTable(unsigned int feature_size, unsigned int subsignature_size)
  311. {
  312. feature_size_ = feature_size;
  313. initialize(subsignature_size);
  314. // Allocate the mask
  315. mask_ = std::vector<size_t>((feature_size * sizeof(char) + sizeof(size_t) - 1) / sizeof(size_t), 0);
  316. // A bit brutal but fast to code
  317. std::vector<int> indices(feature_size * CHAR_BIT);
  318. for (size_t i = 0; i < feature_size * CHAR_BIT; ++i) indices[i] = (int)i;
  319. #ifndef OPENCV_FLANN_USE_STD_RAND
  320. cv::randShuffle(indices);
  321. #else
  322. std::random_shuffle(indices.begin(), indices.end());
  323. #endif
  324. // Generate a random set of order of subsignature_size_ bits
  325. for (unsigned int i = 0; i < key_size_; ++i) {
  326. size_t index = indices[i];
  327. // Set that bit in the mask
  328. size_t divisor = CHAR_BIT * sizeof(size_t);
  329. size_t idx = index / divisor; //pick the right size_t index
  330. mask_[idx] |= size_t(1) << (index % divisor); //use modulo to find the bit offset
  331. }
  332. // Set to 1 if you want to display the mask for debug
  333. #if 0
  334. {
  335. size_t bcount = 0;
  336. BOOST_FOREACH(size_t mask_block, mask_){
  337. out << std::setw(sizeof(size_t) * CHAR_BIT / 4) << std::setfill('0') << std::hex << mask_block
  338. << std::endl;
  339. bcount += __builtin_popcountll(mask_block);
  340. }
  341. out << "bit count : " << std::dec << bcount << std::endl;
  342. out << "mask size : " << mask_.size() << std::endl;
  343. return out;
  344. }
  345. #endif
  346. }
  347. /** Return the Subsignature of a feature
  348. * @param feature the feature to analyze
  349. */
  350. template<>
  351. inline size_t LshTable<unsigned char>::getKey(const unsigned char* feature) const
  352. {
  353. // no need to check if T is dividable by sizeof(size_t) like in the Hamming
  354. // distance computation as we have a mask
  355. // FIXIT: This is bad assumption, because we reading tail bytes after of the allocated features buffer
  356. const size_t* feature_block_ptr = reinterpret_cast<const size_t*> ((const void*)feature);
  357. // Figure out the subsignature of the feature
  358. // Given the feature ABCDEF, and the mask 001011, the output will be
  359. // 000CEF
  360. size_t subsignature = 0;
  361. size_t bit_index = 1;
  362. for (unsigned i = 0; i < feature_size_; i += sizeof(size_t)) {
  363. // get the mask and signature blocks
  364. size_t feature_block;
  365. if (i <= feature_size_ - sizeof(size_t))
  366. {
  367. feature_block = *feature_block_ptr;
  368. }
  369. else
  370. {
  371. size_t tmp = 0;
  372. memcpy(&tmp, feature_block_ptr, feature_size_ - i); // preserve bytes order
  373. feature_block = tmp;
  374. }
  375. size_t mask_block = mask_[i / sizeof(size_t)];
  376. while (mask_block) {
  377. // Get the lowest set bit in the mask block
  378. size_t lowest_bit = mask_block & (-(ptrdiff_t)mask_block);
  379. // Add it to the current subsignature if necessary
  380. subsignature += (feature_block & lowest_bit) ? bit_index : 0;
  381. // Reset the bit in the mask block
  382. mask_block ^= lowest_bit;
  383. // increment the bit index for the subsignature
  384. bit_index <<= 1;
  385. }
  386. // Check the next feature block
  387. ++feature_block_ptr;
  388. }
  389. return subsignature;
  390. }
  391. template<>
  392. inline LshStats LshTable<unsigned char>::getStats() const
  393. {
  394. LshStats stats;
  395. stats.bucket_size_mean_ = 0;
  396. if ((buckets_speed_.empty()) && (buckets_space_.empty())) {
  397. stats.n_buckets_ = 0;
  398. stats.bucket_size_median_ = 0;
  399. stats.bucket_size_min_ = 0;
  400. stats.bucket_size_max_ = 0;
  401. return stats;
  402. }
  403. if (!buckets_speed_.empty()) {
  404. for (BucketsSpeed::const_iterator pbucket = buckets_speed_.begin(); pbucket != buckets_speed_.end(); ++pbucket) {
  405. stats.bucket_sizes_.push_back((lsh::FeatureIndex)pbucket->size());
  406. stats.bucket_size_mean_ += pbucket->size();
  407. }
  408. stats.bucket_size_mean_ /= buckets_speed_.size();
  409. stats.n_buckets_ = buckets_speed_.size();
  410. }
  411. else {
  412. for (BucketsSpace::const_iterator x = buckets_space_.begin(); x != buckets_space_.end(); ++x) {
  413. stats.bucket_sizes_.push_back((lsh::FeatureIndex)x->second.size());
  414. stats.bucket_size_mean_ += x->second.size();
  415. }
  416. stats.bucket_size_mean_ /= buckets_space_.size();
  417. stats.n_buckets_ = buckets_space_.size();
  418. }
  419. std::sort(stats.bucket_sizes_.begin(), stats.bucket_sizes_.end());
  420. // BOOST_FOREACH(int size, stats.bucket_sizes_)
  421. // std::cout << size << " ";
  422. // std::cout << std::endl;
  423. stats.bucket_size_median_ = stats.bucket_sizes_[stats.bucket_sizes_.size() / 2];
  424. stats.bucket_size_min_ = stats.bucket_sizes_.front();
  425. stats.bucket_size_max_ = stats.bucket_sizes_.back();
  426. // TODO compute mean and std
  427. /*float mean, stddev;
  428. stats.bucket_size_mean_ = mean;
  429. stats.bucket_size_std_dev = stddev;*/
  430. // Include a histogram of the buckets
  431. unsigned int bin_start = 0;
  432. unsigned int bin_end = 20;
  433. bool is_new_bin = true;
  434. for (std::vector<unsigned int>::iterator iterator = stats.bucket_sizes_.begin(), end = stats.bucket_sizes_.end(); iterator
  435. != end; )
  436. if (*iterator < bin_end) {
  437. if (is_new_bin) {
  438. stats.size_histogram_.push_back(std::vector<unsigned int>(3, 0));
  439. stats.size_histogram_.back()[0] = bin_start;
  440. stats.size_histogram_.back()[1] = bin_end - 1;
  441. is_new_bin = false;
  442. }
  443. ++stats.size_histogram_.back()[2];
  444. ++iterator;
  445. }
  446. else {
  447. bin_start += 20;
  448. bin_end += 20;
  449. is_new_bin = true;
  450. }
  451. return stats;
  452. }
  453. // End the two namespaces
  454. }
  455. }
  456. #ifdef _MSC_VER
  457. #pragma warning(pop)
  458. #endif
  459. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  460. //! @endcond
  461. #endif /* OPENCV_FLANN_LSH_TABLE_H_ */