Skip to content

A GPU accelerated find() method

Hüseyin Tuğrul BÜYÜKIŞIK edited this page Feb 15, 2021 · 18 revisions

VirtualMultiArray has a find(T object, S objectMember, int maxIndices) method that searches for objects with same member value (same byte representation) and returns a std::vector containing their indices (but with maxIndices elements at most, if found).

  • Search value (S objectMember) needs to be in same parent T object for proper calculations. Using different parent objects for both parameters leads to errors.
  • If an array member (like struct test{ int data[100]; }) needs to be searched for exact byte representation, then it needs to be inside another struct (like struct search_layer { int data[100]; }); struct test { search_layer }; and that struct can be used for searching.
  • If no elements are found, returned vector size is 0.
  • Multiple index order is not guaranteed to be same between repeated searches. OpenCL kernel finds elements and saves results using atomic functions. This lets find() complete quicker than a reduction algorithm when there are only few indices to find.
  • Since finding 1 element has nearly same timing with finding 100 elements, it can beat std::find for big arrays, without even using RAM bandwidth(nor CPU cache bandwidth).

It's not very accelerated currently(14/02/2021) but it can do this at "nearly-constant" time:

preparing virtual array...
initializing data...
benchmarking find()...
test object: 5
found object at index: 5
data search: 40MB         358ms
data: 5
----------------------
----------------------
----------------------
test object: 100005
found object at index: 100005
data search: 40MB         6ms
data: 100005
----------------------
----------------------
----------------------
test object: 200005
found object at index: 200005
data search: 40MB         6ms
data: 200005

....

test object: 7800005
found object at index: 7800005
data search: 40MB         6ms
data: 7800005
----------------------
----------------------
----------------------
test object: 7900005
found object at index: 7900005
data search: 40MB         6ms
data: 7900005
----------------------


while std::find_if performance depends on position of element:

test object: 100005
found object at index: 100005
data search: 40MB         0ms
data: 100005
----------------------
----------------------
----------------------
test object: 200005
found object at index: 200005
data search: 40MB         1ms
data: 200005

...

test object: 7600005
found object at index: 7600005
data search: 40MB         16ms
data: 7600005
----------------------
----------------------
----------------------
test object: 7700005
found object at index: 7700005
data search: 40MB         15ms
data: 7700005
----------------------

Example:

#include "GraphicsCardSupplyDepot.h"
#include "VirtualMultiArray.h"
#include "PcieBandwidthBenchmarker.h"

// testing
#include <iostream>
#include "omp.h"

struct ArrayPassByValue
{
	char buf[3];
};

class Obj
{
public:
	Obj(){b=-1;}
	Obj(int i){b=i;}
	char a;
	int b;
	ArrayPassByValue arr;
	char tmp;
};


int main(int argC, char ** argV)
{


	std::cout<<"preparing virtual array..."<<std::endl;
	size_t n = 10000000;
	size_t p = 10000;
	GraphicsCardSupplyDepot gpu;
	VirtualMultiArray<Obj> arr(n,gpu.requestGpus(),p,3,{4,2,2});

	std::cout<<"initializing data..."<<std::endl;
	#pragma omp parallel for
	for(int i=0;i<10;i++)
	{
		arr.mappedReadWriteAccess(i*(n/10),n/10,[&](Obj * ptr){
			for(size_t j=i*(n/10);j<i*(n/10)+(n/10);j++)
			{
				Obj obj(j);
				for(int k=0;k<3;k++)
					obj.arr.buf[k]=' ';
				obj.arr.buf[j%3]='X';
				ptr[j]=obj;
			}
		},false,false,true);
	}
	std::cout<<"benchmarking find()..."<<std::endl;

	Obj searchObj;

	for(int i=0;i<80;i++)
	{
		std::chrono::milliseconds t1 =  std::chrono::duration_cast< std::chrono::milliseconds >(std::chrono::system_clock::now().time_since_epoch());
		searchObj.b=i*100000+5;
		searchObj.arr.buf[0]='X';
		searchObj.arr.buf[1]=' ';
		searchObj.arr.buf[2]=' ';

		// integer member as search value
		// find at most 100 indices that has same member value
		std::vector<size_t> found = arr.find(searchObj,searchObj.b,100);

		// buffer member as search value
		// find at most 100 indices that has same member value
		//std::vector<size_t> found = arr.find(searchObj,searchObj.arr,100);

		std::chrono::milliseconds t2 =  std::chrono::duration_cast< std::chrono::milliseconds >(std::chrono::system_clock::now().time_since_epoch());
		std::cout<<"test object: "<<i*100000+5<<std::endl;
		std::cout<<"found object at index: "<<(found.size()>0?found[0]:-1)<<std::endl;
		std::cout<<"data search: "<<n*sizeof(int)/1000000.0<<"MB         "<<(t2.count()-t1.count())<<"ms"<<std::endl;
		std::cout<<"data: "<<arr.get(found[0]).b<<std::endl;

		// integer search error check
		if(arr.get(found[0]).b!=i*100000+5)
		{
			std::cout<<"ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR"<<std::endl;
			return 0;
		}
		std::cout<<"----------------------"<<std::endl;
		std::cout<<"----------------------"<<std::endl;
		std::cout<<"----------------------"<<std::endl;

	}
	return 0;
}

Here is another example with multiple indices found per find() call:

#include "GraphicsCardSupplyDepot.h"
#include "VirtualMultiArray.h"
#include "PcieBandwidthBenchmarker.h"

// testing
#include <iostream>
#include "omp.h"
#include <algorithm>



class Obj
{
public:
	char c;
};


int main(int argC, char ** argV)
{


	std::cout<<"preparing virtual array..."<<std::endl;
	size_t n = 1000000000ull;
	size_t p = 100000ull;
	GraphicsCardSupplyDepot gpu;
	VirtualMultiArray<Obj> arr(n,gpu.requestGpus(),p,3,{4,2,2});
	std::cout<<"initializing data..."<<std::endl;

	#pragma omp parallel for
	for(size_t i=0;i<10;i++)
	{
		arr.mappedReadWriteAccess(i*(n/10),n/10,[&](Obj * ptr){
			for(size_t j=i*(n/10);j<i*(n/10)+(n/10);j++)
			{
				Obj obj;
				obj.c=' ';

				ptr[j]=obj;
			}
		},false,false,true);
	}

	Obj searchObj;
	searchObj.c='X';
	arr.set(100,searchObj);
	arr.set(1000,searchObj);
	arr.set(10000,searchObj);
	arr.set(100000,searchObj);
	arr.set(1000000,searchObj);
	arr.set(10000000,searchObj);
	arr.set(100000000,searchObj);
	arr.set(1000000000,searchObj);
	std::cout<<"benchmarking find()..."<<std::endl;



	for(int i=0;i<80;i++)
	{
		std::chrono::milliseconds t1 =  std::chrono::duration_cast< std::chrono::milliseconds >(std::chrono::system_clock::now().time_since_epoch());



		// find at most 100 indices that have elements with c='X'
		std::vector<size_t> found = arr.find(searchObj,searchObj.c,100);


		std::chrono::milliseconds t2 =  std::chrono::duration_cast< std::chrono::milliseconds >(std::chrono::system_clock::now().time_since_epoch());

		std::cout<<"num found: "<<found.size()<<std::endl;
		if(found.size()>0)
		{
			std::cout<<"found: ";
			for(int h=0;h<found.size();h++)
			{
				std::cout<<found[h]<<" ";
			}
			std::cout<<std::endl;
		}
		std::cout<<"data search: "<<n*sizeof(int)/1000000.0<<"MB         "<<(t2.count()-t1.count())<<"ms"<<std::endl;
		std::cout<<"consumed RAM bandwidth: "<<(found.size()*sizeof(size_t) + gpu.requestGpus().size()*101*sizeof(size_t))/((t2.count()-t1.count())/1000.0)/1000000.0<<"MB/s"<<std::endl;



		std::cout<<"----------------------"<<std::endl;
		std::cout<<"----------------------"<<std::endl;
		std::cout<<"----------------------"<<std::endl;

	}
	return 0;
}

output:

num found: 8
found: 100 1000 10000 100000000 1000000000 1000000 100000 10000000 
data search: 4000MB         207ms
consumed RAM bandwidth: 0.0120193MB/s
----------------------
----------------------
----------------------
num found: 8
found: 100 1000 10000 100000000 1000000000 1000000 100000 10000000 
data search: 4000MB         207ms
consumed RAM bandwidth: 0.0120193MB/s
----------------------
----------------------
----------------------
num found: 8
found: 100 1000 10000 100000000 1000000000 1000000 100000 10000000 
data search: 4000MB         207ms
consumed RAM bandwidth: 0.0120193MB/s
----------------------
----------------------
----------------------

Clone this wiki locally