Skip to content

Commit 46699e6

Browse files
authored
Merge pull request #34 from Jaybro/kd_forest
kd_forest rework
2 parents 9ac48f3 + c05db63 commit 46699e6

30 files changed

+731
-414
lines changed

CMakeLists.txt

Lines changed: 19 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/utils.cmake)
66

77
project(pico_tree
88
LANGUAGES CXX
9-
VERSION 0.8.1
9+
VERSION 0.8.2
1010
DESCRIPTION "PicoTree is a C++ header only library for fast nearest neighbor searches and range searches using a KdTree."
1111
HOMEPAGE_URL "https://github.com/Jaybro/pico_tree")
1212

@@ -23,16 +23,19 @@ add_subdirectory(src)
2323
# Ignored when running cmake from setup.py using scikit-build.
2424
if(NOT SKBUILD)
2525
option(BUILD_EXAMPLES "Enable the creation of PicoTree examples." ON)
26+
message(STATUS "BUILD_EXAMPLES: ${BUILD_EXAMPLES}")
2627

2728
if(BUILD_EXAMPLES)
2829
add_subdirectory(examples)
2930
endif()
3031

31-
include(CTest)
3232
find_package(GTest QUIET)
3333

34-
if(BUILD_TESTING)
35-
if(GTEST_FOUND)
34+
if(GTEST_FOUND)
35+
include(CTest)
36+
message(STATUS "BUILD_TESTING: ${BUILD_TESTING}")
37+
38+
if(BUILD_TESTING)
3639
# Tests are dependent on some common code.
3740
# For now, the understory is considered important enough to be tested.
3841
if(NOT TARGET pico_toolshed)
@@ -42,28 +45,24 @@ if(NOT SKBUILD)
4245

4346
enable_testing()
4447
add_subdirectory(test)
45-
message(STATUS "GTest found. Building unit tests.")
46-
else()
47-
message(STATUS "GTest not found. Unit tests will not be build.")
4848
endif()
49+
else()
50+
message(STATUS "GTest not found. Unit tests cannot be build.")
4951
endif()
5052

5153
find_package(Doxygen QUIET)
52-
option(BUILD_DOCS "Build documentation with Doxygen." ON)
5354

54-
if(BUILD_DOCS)
55-
if(DOXYGEN_FOUND)
56-
set(DOC_TARGET_NAME ${PROJECT_NAME}_doc)
55+
if(DOXYGEN_FOUND)
56+
set(DOC_TARGET_NAME ${PROJECT_NAME}_doc)
5757

58-
# Hide the internal namespace from the documentation.
59-
# set(DOXYGEN_EXCLUDE_SYMBOLS "internal")
60-
doxygen_add_docs(
61-
${DOC_TARGET_NAME}
62-
src/pico_tree)
58+
# Hide the internal namespace from the documentation.
59+
# set(DOXYGEN_EXCLUDE_SYMBOLS "internal")
60+
doxygen_add_docs(
61+
${DOC_TARGET_NAME}
62+
src/pico_tree)
6363

64-
message(STATUS "Doxygen found. To build the documentation: cmake --build . --target ${DOC_TARGET_NAME}")
65-
else()
66-
message(STATUS "Doxygen not found. Documentation cannot be build.")
67-
endif()
64+
message(STATUS "Doxygen found. To build the documentation: cmake --build . --target ${DOC_TARGET_NAME}")
65+
else()
66+
message(STATUS "Doxygen not found. Documentation cannot be build.")
6867
endif()
6968
endif()

README.md

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ PicoTree is a C++ header only library with [Python bindings](https://github.com/
1111
| [Scikit-learn KDTree][skkd] 1.2.2 | ... | 6.2s | ... | 42.2s |
1212
| [pykdtree][pykd] 1.3.7 | ... | 1.0s | ... | 6.6s |
1313
| [OpenCV FLANN][cvfn] 4.6.0 | 1.9s | ... | 4.7s | ... |
14-
| PicoTree KdTree v0.8.1 | 0.9s | 1.0s | 2.8s | 3.1s |
14+
| PicoTree KdTree v0.8.2 | 0.9s | 1.0s | 2.8s | 3.1s |
1515

1616
Two [LiDAR](./docs/benchmark.md) based point clouds of sizes 7733372 and 7200863 were used to generate these numbers. The first point cloud was the input to the build algorithm and the second to the query algorithm. All benchmarks were run on a single thread with the following parameters: `max_leaf_size=10` and `knn=1`. A more detailed [C++ comparison](./docs/benchmark.md) of PicoTree is available with respect to [nanoflann][nano].
1717

@@ -61,7 +61,7 @@ PicoTree can interface with different types of points and point sets through tra
6161
* Creating a [custom search visitor](./examples/kd_tree/kd_tree_custom_search_visitor.cpp).
6262
* [Saving and loading](./examples/kd_tree/kd_tree_save_and_load.cpp) a KdTree to and from a file.
6363
* Support for [Eigen](./examples/eigen/eigen.cpp) and [OpenCV](./examples/opencv/opencv.cpp) data types.
64-
* Running the KdTree on the [MNIST](./examples/mnist/mnist.cpp) [database](http://yann.lecun.com/exdb/mnist/).
64+
* [Running the KdTree and KdForest](./examples/kd_forest/kd_forest.cpp) on the [MNIST](http://yann.lecun.com/exdb/mnist/) and [SIFT](http://corpus-texmex.irisa.fr/) datasets.
6565
* How to use the [KdTree with Python](./examples/python/kd_tree.py).
6666

6767
# Requirements
@@ -113,9 +113,11 @@ $ pip install ./pico_tree
113113

114114
# References
115115

116-
* [Computational Geometry - Algorithms and Applications.](https://www.springer.com/gp/book/9783540779735) Mark de Berg, Otfried Cheong, Marc van Kreveld, and Mark Overmars, Springer-Verlag, third edition, 2008.
117-
* S. Maneewongvatana and D. M. Mount. [It's okay to be skinny, if your friends are fat.](http://www.cs.umd.edu/~mount/Papers/cgc99-smpack.pdf) 4th Annual CGC Workshop on Computational Geometry, 1999.
118-
* S. Arya and H. Y. Fu. [Expected-case complexity of approximate nearest neighbor searching.](https://www.cse.ust.hk/faculty/arya/pub/exp.pdf) InProceedings of the 11th ACM-SIAM Symposium on Discrete Algorithms, 2000.
119-
* S. Arya and D. M. Mount. [Algorithms for fast vector quantization.](https://www.cs.umd.edu/~mount/Papers/DCC.pdf) In IEEE Data Compression Conference, pages 381–390, March 1993.
120-
* N. Sample, M. Haines, M. Arnold and T. Purcell. [Optimizing Search Strategies in k-d Trees.](http://infolab.stanford.edu/~nsample/pubs/samplehaines.pdf) In: 5th WSES/IEEE World Multiconference on Circuits, Systems, Communications & Computers (CSCC 2001), July 2001.
121-
* A. Yershova and S. M. LaValle, [Improving Motion-Planning Algorithms by Efficient Nearest-Neighbor Searching.](http://msl.cs.uiuc.edu/~lavalle/papers/YerLav06.pdf) In IEEE Transactions on Robotics, vol. 23, no. 1, pp. 151-157, Feb. 2007.
116+
* J. L. Bentley, [Multidimensional binary search trees used for associative searching](https://dl.acm.org/doi/pdf/10.1145/361002.361007), Communications of the ACM, vol. 18, no. 9, pp. 509–517, 1975.
117+
* S. Arya and D. M. Mount, [Algorithms for fast vector quantization](https://www.cs.umd.edu/~mount/Papers/DCC.pdf), In IEEE Data Compression Conference, pp. 381–390, March 1993.
118+
* S. Maneewongvatana and D. M. Mount, [It's okay to be skinny, if your friends are fat](http://www.cs.umd.edu/~mount/Papers/cgc99-smpack.pdf), 4th Annual CGC Workshop on Computational Geometry, 1999.
119+
* S. Arya and H. Y. Fu, [Expected-case complexity of approximate nearest neighbor searching](https://www.cse.ust.hk/faculty/arya/pub/exp.pdf), In Proceedings of the 11th ACM-SIAM Symposium on Discrete Algorithms, 2000.
120+
* N. Sample, M. Haines, M. Arnold and T. Purcell, [Optimizing Search Strategies in k-d Trees](http://infolab.stanford.edu/~nsample/pubs/samplehaines.pdf), In: 5th WSES/IEEE World Multiconference on Circuits, Systems, Communications & Computers (CSCC 2001), July 2001.
121+
* A. Yershova and S. M. LaValle, [Improving Motion-Planning Algorithms by Efficient Nearest-Neighbor Searching](http://msl.cs.uiuc.edu/~lavalle/papers/YerLav06.pdf), In IEEE Transactions on Robotics, vol. 23, no. 1, pp. 151-157, Feb. 2007.
122+
* M. de Berg, O. Cheong, M. van Kreveld, and M. Overmars, [Computational Geometry - Algorithms and Applications](https://www.springer.com/gp/book/9783540779735), Springer-Verlag, third edition, 2008.
123+
* C. Silpa-Anan and R. Hartley, [Optimised KD-trees for fast image descriptor matching](http://vigir.missouri.edu/~gdesouza/Research/Conference_CDs/IEEE_CVPR_2008/data/papers/298.pdf), In CVPR, 2008.

docs/benchmark.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ One of the PicoTree examples contains [benchmarks](../examples/benchmark/) of di
44

55
The results described in this document were generated on 29-08-2021 using MinGW GCC 10.3, PicoTree v0.7.4 and Nanoflann v1.3.2.
66

7-
Note: The performance of PicoTree v0.8.0 released on 30-6-2023 is identical to that of v0.7.4. However, the build algorithm of nanoflann v1.5.0 regressed and has become 90% slower.
7+
Note: The performance of PicoTree v0.8.2 released on 07-09-2023 is identical to that of v0.7.4. However, the build algorithm of nanoflann v1.5.0 regressed and has become 90% slower.
88

99
# Data sets
1010

examples/CMakeLists.txt

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@ add_subdirectory(pico_understory)
88

99
add_subdirectory(kd_tree)
1010

11+
add_subdirectory(kd_forest)
12+
1113
find_package(Eigen3 QUIET)
1214

1315
if(Eigen3_FOUND)
@@ -35,10 +37,6 @@ else()
3537
message(STATUS "benchmark not found. PicoTree benchmarks skipped.")
3638
endif()
3739

38-
if(Eigen3_FOUND)
39-
add_subdirectory(mnist)
40-
endif()
41-
4240
# The Python examples only get copied when the bindings module is built.
4341
if(TARGET _pyco_tree)
4442
add_subdirectory(python)

examples/kd_forest/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
add_executable(kd_forest kd_forest.cpp)
2+
set_default_target_properties(kd_forest)
3+
target_link_libraries(kd_forest PUBLIC pico_toolshed pico_understory)

examples/kd_forest/kd_forest.cpp

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
#include <iostream>
2+
#include <pico_toolshed/format/format_bin.hpp>
3+
#include <pico_toolshed/scoped_timer.hpp>
4+
#include <pico_tree/array_traits.hpp>
5+
#include <pico_tree/kd_tree.hpp>
6+
#include <pico_tree/vector_traits.hpp>
7+
#include <pico_understory/kd_forest.hpp>
8+
9+
#include "mnist.hpp"
10+
#include "sift.hpp"
11+
12+
// A KdForest takes roughly forest_size times longer to build compared to
13+
// building a KdTree. However, the KdForest is usually a lot faster with queries
14+
// in high dimensions with the added trade-off that the exact nearest neighbor
15+
// may not be found.
16+
template <typename Dataset>
17+
void RunDataset(
18+
std::size_t tree_max_leaf_size,
19+
std::size_t forest_size,
20+
std::size_t forest_max_leaf_size,
21+
std::size_t forest_max_leaves_visited) {
22+
using Point = typename Dataset::PointType;
23+
using Space = std::reference_wrapper<std::vector<Point>>;
24+
using Scalar = typename Point::value_type;
25+
26+
auto train = Dataset::ReadTrain();
27+
auto test = Dataset::ReadTest();
28+
std::size_t count = test.size();
29+
std::vector<pico_tree::Neighbor<int, Scalar>> nns(count);
30+
std::string fn_nns_gt = Dataset::kDatasetName + "_nns_gt.bin";
31+
32+
if (!std::filesystem::exists(fn_nns_gt)) {
33+
std::cout << "Creating " << fn_nns_gt
34+
<< " using the KdTree. Be *very* patient." << std::endl;
35+
36+
auto kd_tree = [&train, &tree_max_leaf_size]() {
37+
ScopedTimer t0("kd_tree build");
38+
return pico_tree::KdTree<Space>(train, tree_max_leaf_size);
39+
}();
40+
41+
{
42+
ScopedTimer t1("kd_tree query");
43+
for (std::size_t i = 0; i < nns.size(); ++i) {
44+
kd_tree.SearchNn(test[i], nns[i]);
45+
}
46+
}
47+
48+
pico_tree::WriteBin(fn_nns_gt, nns);
49+
} else {
50+
pico_tree::ReadBin(fn_nns_gt, nns);
51+
std::cout << "KdTree not created. Read " << fn_nns_gt << " instead."
52+
<< std::endl;
53+
}
54+
55+
std::size_t equal = 0;
56+
{
57+
auto rkd_tree = [&train, &forest_max_leaf_size, &forest_size]() {
58+
ScopedTimer t0("kd_forest build");
59+
return pico_tree::KdForest<Space>(
60+
train, forest_max_leaf_size, forest_size);
61+
}();
62+
63+
ScopedTimer t1("kd_forest query");
64+
pico_tree::Neighbor<int, Scalar> nn;
65+
for (std::size_t i = 0; i < nns.size(); ++i) {
66+
rkd_tree.SearchNn(test[i], forest_max_leaves_visited, nn);
67+
68+
if (nns[i].index == nn.index) {
69+
++equal;
70+
}
71+
}
72+
}
73+
74+
std::cout << "Precision: "
75+
<< (static_cast<float>(equal) / static_cast<float>(count))
76+
<< std::endl;
77+
}
78+
79+
// Runs the KdTree versus KdForest comparison on the MNIST and SIFT datasets.
int main() {
  // Both runs share the KdTree leaf size and the forest size.
  constexpr std::size_t kTreeMaxLeafSize = 16;
  constexpr std::size_t kForestSize = 8;

  // MNIST with forest_max_leaf_size = 16 and forest_max_leaves_visited = 16:
  //   forest_size 8: a precision of around 0.915.
  //   forest_size 16: a precision of around 0.976.
  RunDataset<Mnist>(kTreeMaxLeafSize, kForestSize, 16, 16);

  // SIFT with forest_max_leaf_size = 32 and forest_max_leaves_visited = 64:
  //   forest_size 8: a precision of around 0.884.
  //   forest_size 16: a precision of around 0.940.
  //   forest_size 128: out of memory :'(
  RunDataset<Sift>(kTreeMaxLeafSize, kForestSize, 32, 64);

  return 0;
}

examples/kd_forest/mnist.hpp

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
#pragma once
2+
3+
#include <algorithm>
4+
#include <filesystem>
5+
#include <pico_toolshed/format/format_mnist.hpp>
6+
7+
// Converts every element of a fixed-size array from T to U via static_cast.
//
// U must be specified explicitly; T and N are deduced from the argument.
// Returns an array of the same length holding the converted values.
template <typename U, typename T, std::size_t N>
std::array<U, N> Cast(std::array<T, N> const& i) {
  std::array<U, N> c;
  std::transform(i.begin(), i.end(), c.begin(), [](T a) -> U {
    return static_cast<U>(a);
  });
  return c;
}

// Converts a vector of fixed-size arrays element-wise from T to U.
//
// U must be specified explicitly; T and N are deduced from the argument.
// Returns a vector with one converted array per input array, in input order.
template <typename U, typename T, std::size_t N>
std::vector<std::array<U, N>> Cast(std::vector<std::array<T, N>> const& i) {
  std::vector<std::array<U, N>> c;
  // The output size is known up front. Reserving avoids repeatedly
  // reallocating and copying what may be many large arrays.
  c.reserve(i.size());
  for (auto const& a : i) {
    c.push_back(Cast<U>(a));
  }
  return c;
}
26+
27+
class Mnist {
28+
private:
29+
using Scalar = float;
30+
using ImageByte = std::array<std::byte, 28 * 28>;
31+
using ImageFloat = std::array<Scalar, 28 * 28>;
32+
33+
static std::vector<ImageFloat> ReadImages(std::string const& filename) {
34+
if (!std::filesystem::exists(filename)) {
35+
throw std::runtime_error(filename + " doesn't exist.");
36+
}
37+
38+
std::vector<ImageByte> images_u8;
39+
pico_tree::ReadMnistImages(filename, images_u8);
40+
return Cast<Scalar>(images_u8);
41+
}
42+
43+
public:
44+
using PointType = ImageFloat;
45+
46+
static std::string const kDatasetName;
47+
48+
static std::vector<PointType> ReadTrain() {
49+
std::string fn_images_train = "train-images.idx3-ubyte";
50+
return ReadImages(fn_images_train);
51+
}
52+
53+
static std::vector<PointType> ReadTest() {
54+
std::string fn_images_test = "t10k-images.idx3-ubyte";
55+
return ReadImages(fn_images_test);
56+
}
57+
};
58+
59+
std::string const Mnist::kDatasetName = "mnist";

examples/kd_forest/sift.hpp

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
#pragma once
2+
3+
#include <filesystem>
4+
#include <pico_toolshed/format/format_xvecs.hpp>
5+
6+
class Sift {
7+
private:
8+
using VectorFloat = std::array<float, 128>;
9+
10+
static std::vector<VectorFloat> ReadVectors(std::string const& filename) {
11+
if (!std::filesystem::exists(filename)) {
12+
throw std::runtime_error(filename + " doesn't exist.");
13+
}
14+
15+
std::vector<VectorFloat> vectors;
16+
pico_tree::ReadXvecs(filename, vectors);
17+
return vectors;
18+
}
19+
20+
public:
21+
using PointType = VectorFloat;
22+
23+
static std::string const kDatasetName;
24+
25+
static std::vector<PointType> ReadTrain() {
26+
std::string fn_images_train = "sift_base.fvecs";
27+
return ReadVectors(fn_images_train);
28+
}
29+
30+
static std::vector<PointType> ReadTest() {
31+
std::string fn_images_test = "sift_query.fvecs";
32+
return ReadVectors(fn_images_test);
33+
}
34+
};
35+
36+
std::string const Sift::kDatasetName = "sift";

examples/mnist/CMakeLists.txt

Lines changed: 0 additions & 3 deletions
This file was deleted.

0 commit comments

Comments
 (0)