-
Notifications
You must be signed in to change notification settings - Fork 3
A GPU accelerated find() method
Hüseyin Tuğrul BÜYÜKIŞIK edited this page Feb 15, 2021
·
18 revisions
VirtualMultiArray has a find(T object, S objectMember, int maxIndices) method that searches for objects with same member value (same byte representation) and returns a std::vector containing their indices (but with maxIndices elements at most, if found).
- Search value (
S objectMember) needs to be in same parentT objectfor proper calculations. Using different parent objects for both parameters leads to errors. - If an array member (like
struct test{ int data[100]; }) needs to be searched for exact byte representation, then it needs to be inside another struct (likestruct search_layer { int data[100]; }); struct test { search_layer };and that struct can be used for searching. - If no elements are found, returned vector size is 0.
- Multiple index order is not guaranteed to be same between repeated searches. OpenCL kernel finds elements and saves results using atomic functions. This lets find() complete quicker than a reduction algorithm when there are only few indices to find.
- Since finding 1 element has nearly same timing with finding 100 elements, it can beat std::find for big arrays, without even using RAM bandwidth(nor CPU cache bandwidth).
It's not very accelerated currently(14/02/2021) but it can do this at "nearly-constant" time:
preparing virtual array...
initializing data...
benchmarking find()...
test object: 5
found object at index: 5
data search: 40MB 358ms
data: 5
----------------------
----------------------
----------------------
test object: 100005
found object at index: 100005
data search: 40MB 6ms
data: 100005
----------------------
----------------------
----------------------
test object: 200005
found object at index: 200005
data search: 40MB 6ms
data: 200005
....
test object: 7800005
found object at index: 7800005
data search: 40MB 6ms
data: 7800005
----------------------
----------------------
----------------------
test object: 7900005
found object at index: 7900005
data search: 40MB 6ms
data: 7900005
----------------------
while std::find_if performance depends on position of element:
test object: 100005
found object at index: 100005
data search: 40MB 0ms
data: 100005
----------------------
----------------------
----------------------
test object: 200005
found object at index: 200005
data search: 40MB 1ms
data: 200005
...
test object: 7600005
found object at index: 7600005
data search: 40MB 16ms
data: 7600005
----------------------
----------------------
----------------------
test object: 7700005
found object at index: 7700005
data search: 40MB 15ms
data: 7700005
----------------------
Example:
#include "GraphicsCardSupplyDepot.h"
#include "VirtualMultiArray.h"
#include "PcieBandwidthBenchmarker.h"
// testing
#include <iostream>
#include "omp.h"
struct ArrayPassByValue
{
char buf[3];
};
class Obj
{
public:
Obj(){b=-1;}
Obj(int i){b=i;}
char a;
int b;
ArrayPassByValue arr;
char tmp;
};
int main(int argC, char ** argV)
{
std::cout<<"preparing virtual array..."<<std::endl;
size_t n = 10000000;
size_t p = 10000;
GraphicsCardSupplyDepot gpu;
VirtualMultiArray<Obj> arr(n,gpu.requestGpus(),p,3,{4,2,2});
std::cout<<"initializing data..."<<std::endl;
#pragma omp parallel for
for(int i=0;i<10;i++)
{
arr.mappedReadWriteAccess(i*(n/10),n/10,[&](Obj * ptr){
for(size_t j=i*(n/10);j<i*(n/10)+(n/10);j++)
{
Obj obj(j);
for(int k=0;k<3;k++)
obj.arr.buf[k]=' ';
obj.arr.buf[j%3]='X';
ptr[j]=obj;
}
},false,false,true);
}
std::cout<<"benchmarking find()..."<<std::endl;
Obj searchObj;
for(int i=0;i<80;i++)
{
std::chrono::milliseconds t1 = std::chrono::duration_cast< std::chrono::milliseconds >(std::chrono::system_clock::now().time_since_epoch());
searchObj.b=i*100000+5;
searchObj.arr.buf[0]='X';
searchObj.arr.buf[1]=' ';
searchObj.arr.buf[2]=' ';
// integer member as search value
// find at most 100 indices that has same member value
std::vector<size_t> found = arr.find(searchObj,searchObj.b,100);
// buffer member as search value
// find at most 100 indices that has same member value
//std::vector<size_t> found = arr.find(searchObj,searchObj.arr,100);
std::chrono::milliseconds t2 = std::chrono::duration_cast< std::chrono::milliseconds >(std::chrono::system_clock::now().time_since_epoch());
std::cout<<"test object: "<<i*100000+5<<std::endl;
std::cout<<"found object at index: "<<(found.size()>0?found[0]:-1)<<std::endl;
std::cout<<"data search: "<<n*sizeof(int)/1000000.0<<"MB "<<(t2.count()-t1.count())<<"ms"<<std::endl;
std::cout<<"data: "<<arr.get(found[0]).b<<std::endl;
// integer search error check
if(arr.get(found[0]).b!=i*100000+5)
{
std::cout<<"ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR"<<std::endl;
return 0;
}
std::cout<<"----------------------"<<std::endl;
std::cout<<"----------------------"<<std::endl;
std::cout<<"----------------------"<<std::endl;
}
return 0;
}
Here is another example with multiple indices found per find() call:
#include "GraphicsCardSupplyDepot.h"
#include "VirtualMultiArray.h"
#include "PcieBandwidthBenchmarker.h"
// testing
#include <iostream>
#include "omp.h"
#include <algorithm>
class Obj
{
public:
char c;
};
int main(int argC, char ** argV)
{
std::cout<<"preparing virtual array..."<<std::endl;
size_t n = 1000000000ull;
size_t p = 100000ull;
GraphicsCardSupplyDepot gpu;
VirtualMultiArray<Obj> arr(n,gpu.requestGpus(),p,3,{4,2,2});
std::cout<<"initializing data..."<<std::endl;
#pragma omp parallel for
for(size_t i=0;i<10;i++)
{
arr.mappedReadWriteAccess(i*(n/10),n/10,[&](Obj * ptr){
for(size_t j=i*(n/10);j<i*(n/10)+(n/10);j++)
{
Obj obj;
obj.c=' ';
ptr[j]=obj;
}
},false,false,true);
}
Obj searchObj;
searchObj.c='X';
arr.set(100,searchObj);
arr.set(1000,searchObj);
arr.set(10000,searchObj);
arr.set(100000,searchObj);
arr.set(1000000,searchObj);
arr.set(10000000,searchObj);
arr.set(100000000,searchObj);
arr.set(1000000000,searchObj);
std::cout<<"benchmarking find()..."<<std::endl;
for(int i=0;i<80;i++)
{
std::chrono::milliseconds t1 = std::chrono::duration_cast< std::chrono::milliseconds >(std::chrono::system_clock::now().time_since_epoch());
// find at most 100 indices that have elements with c='X'
std::vector<size_t> found = arr.find(searchObj,searchObj.c,100);
std::chrono::milliseconds t2 = std::chrono::duration_cast< std::chrono::milliseconds >(std::chrono::system_clock::now().time_since_epoch());
std::cout<<"num found: "<<found.size()<<std::endl;
if(found.size()>0)
{
std::cout<<"found: ";
for(int h=0;h<found.size();h++)
{
std::cout<<found[h]<<" ";
}
std::cout<<std::endl;
}
std::cout<<"data search: "<<n*sizeof(int)/1000000.0<<"MB "<<(t2.count()-t1.count())<<"ms"<<std::endl;
std::cout<<"consumed RAM bandwidth: "<<(found.size()*sizeof(size_t) + gpu.requestGpus().size()*101*sizeof(size_t))/((t2.count()-t1.count())/1000.0)/1000000.0<<"MB/s"<<std::endl;
std::cout<<"----------------------"<<std::endl;
std::cout<<"----------------------"<<std::endl;
std::cout<<"----------------------"<<std::endl;
}
return 0;
}
output:
num found: 8
found: 100 1000 10000 100000000 1000000000 1000000 100000 10000000
data search: 4000MB 207ms
consumed RAM bandwidth: 0.0120193MB/s
----------------------
----------------------
----------------------
num found: 8
found: 100 1000 10000 100000000 1000000000 1000000 100000 10000000
data search: 4000MB 207ms
consumed RAM bandwidth: 0.0120193MB/s
----------------------
----------------------
----------------------
num found: 8
found: 100 1000 10000 100000000 1000000000 1000000 100000 10000000
data search: 4000MB 207ms
consumed RAM bandwidth: 0.0120193MB/s
----------------------
----------------------
----------------------