11#include " RuCLIPProcessor.h"
22
33// /
4- torch::Tensor CVMatToTorchTensor (const cv::Mat img, const bool perm = true )
4+ inline torch::Tensor CVMatToTorchTensor (const cv::Mat img, const bool perm = true )
55{
66 auto tensor_image = torch::from_blob (img.data , { img.rows , img.cols , img.channels () }, at::kByte );
77 if (perm)
@@ -12,7 +12,7 @@ torch::Tensor CVMatToTorchTensor(const cv::Mat img, const bool perm = true)
1212}
1313
1414// /
15- cv::Mat TorchTensorToCVMat (const torch::Tensor tensor_image, const bool perm = true )
15+ inline cv::Mat TorchTensorToCVMat (const torch::Tensor tensor_image, const bool perm = true )
1616{
1717 auto t = tensor_image.detach ().squeeze ().cpu ();
1818 if (perm)
@@ -33,20 +33,19 @@ RuCLIPProcessor :: RuCLIPProcessor(
3333) : ImageSize(image_size), TextSeqLength(text_seq_length), NormMean(norm_mean), NormStd(norm_std)
3434{
3535 vkcom::Status status;
36- Tokenizer = new vkcom::BaseEncoder (tokenizer_path, -1 , &status);
36+ Tokenizer = std::make_unique< vkcom::BaseEncoder> (tokenizer_path, -1 , &status);
3737}
3838
///!!! Locale/Unicode caveat: the input text is NOT lowercased here (see the commented-out lowercase calls below) — verify the tokenizer's expectations
40- torch::Tensor RuCLIPProcessor :: EncodeText(/* std::vector<*/ std::string &text)
40+ torch::Tensor RuCLIPProcessor :: EncodeText(const /* std::vector<*/ std::string &text) const
4141{
4242 std::vector<std::vector<int32_t >> ret_ids;
4343 vkcom::Status status;
4444 // //for (auto &it : text)
4545 // // it = lowercase(it);
4646 // text = lowercase(text);
4747 // output_type = vkcom::OutputType::ID, bos = false, eos = false, reverse = false, dropout_prob = 0.0
48- std::vector <std::string> texts;
49- texts.push_back (text);
48+ std::vector <std::string> texts{ text };
5049 status = Tokenizer->encode_as_ids (texts, &ret_ids);
5150 if (status.code != 0 )
5251 throw std::runtime_error (" RuCLIPProcessor::EncodeText error : " + status.message );
@@ -61,7 +60,51 @@ torch::Tensor RuCLIPProcessor :: EncodeText(/*std::vector<*/std::string &text)
6160 return PrepareTokens (it);
6261}
6362
64- torch::Tensor RuCLIPProcessor :: PrepareTokens(/* std::vector<*/ std::vector<int32_t > tokens) // Передаю по значению чтобы внутри иметь дело с копией
63+ // /
64+ cv::Mat RuCLIPProcessor::ResizeToInput (const cv::Mat& img, bool saveAspectRatio) const
65+ {
66+ cv::Mat newImg (cv::Size (ImageSize, ImageSize), img.type (), cv::Scalar (0 , 0 , 0 ));
67+
68+ if (saveAspectRatio)
69+ {
70+ // resize the image with aspect ratio
71+ float r = std::min (static_cast <float >(ImageSize) / static_cast <float >(img.rows ), static_cast <float >(ImageSize) / static_cast <float >(img.cols ));
72+ int newHeight = cvRound (img.rows * r);
73+ int newWidth = cvRound (img.cols * r);
74+
75+ // Additional checks for images with non even dims
76+ if ((ImageSize - newWidth) % 2 )
77+ newWidth--;
78+ if ((ImageSize - newHeight) % 2 )
79+ newHeight--;
80+ assert ((ImageSize - newWidth) % 2 == 0 );
81+ assert ((ImageSize - newHeight) % 2 == 0 );
82+
83+ int xOffset = (ImageSize - newWidth) / 2 ;
84+ int yOffset = (ImageSize - newHeight) / 2 ;
85+
86+ assert (2 * m_XOffset + newWidth == ImageSize);
87+ assert (2 * m_YOffset + newHeight == ImageSize);
88+
89+ cv::resize (img, newImg (cv::Rect (xOffset, yOffset, newWidth, newHeight)), cv::Size (newWidth, newHeight), 0 , 0 , cv::INTER_CUBIC);
90+ }
91+ else
92+ {
93+ cv::resize (img, newImg, newImg.size (), 0 , 0 , cv::INTER_CUBIC);
94+ }
95+ return newImg;
96+ }
97+
98+ // /
99+ torch::Tensor RuCLIPProcessor::EncodeImage (const cv::Mat& img) const
100+ {
101+ torch::Tensor img_tensor = CVMatToTorchTensor (ResizeToInput (img), true );
102+ img_tensor = torch::data::transforms::Normalize<>(NormMean, NormStd)(img_tensor);
103+ return img_tensor;
104+ }
105+
106+ // /
107+ torch::Tensor RuCLIPProcessor::PrepareTokens (/* std::vector<*/ std::vector<int32_t > tokens) const // Передаю по значению чтобы внутри иметь дело с копией
65108{
66109 torch::Tensor result;
67110 if (tokens.size () > TextSeqLength)
@@ -89,7 +132,13 @@ void RuCLIPProcessor::CacheText(const std::vector <std::string>& texts)
89132}
90133
91134// /
92- std::pair<torch::Tensor, torch::Tensor> RuCLIPProcessor::operator ()(const std::vector <std::string> &texts, const std::vector <cv::Mat> &images)
135+ const std::vector<torch::Tensor>& RuCLIPProcessor::GetTextTensors () const
136+ {
137+ return m_textsTensors;
138+ }
139+
140+ // /
141+ std::pair<torch::Tensor, torch::Tensor> RuCLIPProcessor::operator ()(const std::vector <std::string> &texts, const std::vector <cv::Mat> &images) const
93142{
94143 std::vector <torch::Tensor> texts_tensors;
95144 for (auto & it : texts)
@@ -102,21 +151,21 @@ std::pair<torch::Tensor, torch::Tensor> RuCLIPProcessor::operator()(const std::v
102151 std::vector <torch::Tensor> images_tensors;
103152 for (auto &it : images)
104153 {
105- torch::Tensor img_tensor = CVMatToTorchTensor (it , true );
154+ torch::Tensor img_tensor = CVMatToTorchTensor (ResizeToInput (it) , true );
106155 img_tensor = torch::data::transforms::Normalize<>(NormMean, NormStd)(img_tensor);
107156 // img_tensor.clone();
108157 images_tensors.push_back (img_tensor);
109158 }
110- return std::make_pair (/* torch::pad_sequence*/ torch::stack (texts_tensors), torch::pad_sequence (images_tensors).squeeze (0 ));
159+ return std::make_pair (!texts_tensors. empty ()? /* torch::pad_sequence*/ torch::stack (texts_tensors): torch::Tensor ( ), torch::pad_sequence (images_tensors).squeeze (0 ));
111160}
112161
113162// /
114- std::pair<torch::Tensor, torch::Tensor> RuCLIPProcessor::operator ()(const std::vector <cv::Mat>& images)
163+ std::pair<torch::Tensor, torch::Tensor> RuCLIPProcessor::operator ()(const std::vector <cv::Mat>& images) const
115164{
116165 std::vector <torch::Tensor> images_tensors;
117166 for (auto & it : images)
118167 {
119- torch::Tensor img_tensor = CVMatToTorchTensor (it , true );
168+ torch::Tensor img_tensor = CVMatToTorchTensor (ResizeToInput (it) , true );
120169 img_tensor = torch::data::transforms::Normalize<>(NormMean, NormStd)(img_tensor);
121170 // img_tensor.clone();
122171 images_tensors.push_back (img_tensor);
0 commit comments