Skip to content

Commit 6d7a9c9

Browse files
authored
Merge pull request #466 from Nuzhny007/master
Update RuCLIP
2 parents a3c373d + f243dcc commit 6d7a9c9

File tree

5 files changed

+158
-80
lines changed

5 files changed

+158
-80
lines changed

thirdparty/ruclip/ClipAPI.cpp

Lines changed: 6 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,6 @@ class ClassificationCLIP::ClassificationCLIPImpl
1919
bool res = true;
2020

2121
m_pathToClip = pathToClip;
22-
m_pathToBPE = pathToBPE;
23-
m_inputImgSize = inputImgSize;
2422
m_indGPU = indGPU;
2523
m_labels = labels;
2624

@@ -42,11 +40,9 @@ class ClassificationCLIP::ClassificationCLIPImpl
4240
m_clip->to(m_device);
4341

4442
std::cout << "Load processor from: " << pathToBPE << std::endl;
45-
std::vector<double> normMean{ 0.48145466, 0.4578275, 0.40821073 };
46-
std::vector<double> normStd{ 0.26862954, 0.26130258, 0.27577711 };
47-
m_processor = std::make_unique<RuCLIPProcessor>(m_pathToBPE, m_inputImgSize, 77, normMean, normStd);
43+
m_processor = RuCLIPProcessor::FromPretrained(m_pathToClip);
4844

49-
m_processor->CacheText(m_labels);
45+
m_processor.CacheText(m_labels);
5046

5147
return res;
5248
}
@@ -69,12 +65,10 @@ class ClassificationCLIP::ClassificationCLIPImpl
6965
for (size_t i = 0; i < rois.size(); ++i)
7066
{
7167
cv::Rect r = Clamp(rois[i], frame.size());
72-
if (r.width > m_inputImgSize / 10 && r.height > m_inputImgSize / 10)
68+
if (r.width > m_processor.GetImageSize() / 10 && r.height > m_processor.GetImageSize() / 10)
7369
{
7470
img2roi[images.size()] = i;
75-
cv::Mat tmp;
76-
cv::resize(cv::Mat(frame, r), tmp, cv::Size(m_inputImgSize, m_inputImgSize), cv::INTER_CUBIC);
77-
images.emplace_back(tmp);
71+
images.emplace_back(cv::Mat(frame, r));
7872
}
7973
}
8074
if (images.empty())
@@ -84,7 +78,7 @@ class ClassificationCLIP::ClassificationCLIPImpl
8478
}
8579

8680
std::cout << "Running on " << images.size() << "..." << std::endl;
87-
auto dummy_input = m_processor->operator()(images);
81+
auto dummy_input = m_processor.operator()(images);
8882
try
8983
{
9084
torch::Tensor logits_per_image = m_clip->forward(dummy_input.first.to(m_device), dummy_input.second.to(m_device));
@@ -123,13 +117,11 @@ class ClassificationCLIP::ClassificationCLIPImpl
123117

124118
private:
125119
std::string m_pathToClip = "";
126-
std::string m_pathToBPE = "";
127-
int m_inputImgSize = 336;
128120
int m_indGPU = -1; // -1 - use CPU
129121

130122
torch::Device m_device{ torch::kCPU };
131123
CLIP m_clip = nullptr;
132-
std::unique_ptr<RuCLIPProcessor> m_processor;
124+
RuCLIPProcessor m_processor;
133125

134126
std::vector<std::string> m_labels{ "human", "pedestrian", "car", "vehicle", "truck", "bus" };
135127
};

thirdparty/ruclip/RuCLIP.cpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -57,9 +57,9 @@ torch::Tensor TransformerImpl :: forward(const torch::Tensor& x)
5757

5858
void TransformerImpl :: InitializeParameters()
5959
{
60-
float proj_std = powf(Width, -0.5f) * pow(2 * Layers, -0.5f);
61-
float attn_std = powf(Width, -0.5f);
62-
float fc_std = powf(2 * Width, -0.5f);
60+
float proj_std = powf((float)Width, -0.5f) * powf(2.f * Layers, -0.5f);
61+
float attn_std = powf((float)Width, -0.5f);
62+
float fc_std = powf(2.f * Width, -0.5f);
6363

6464
for (int i = 0; i < Resblocks->size(); i++)
6565
{
@@ -90,7 +90,7 @@ VisionTransformerImpl :: VisionTransformerImpl(
9090
) : torch::nn::Module(module_name), InputResolution(input_resolution), OutputDim(output_dim)
9191
{
9292
Conv1 = torch::nn::Conv2d(torch::nn::Conv2dOptions(3, width, patch_size).stride(patch_size).bias(false));
93-
float scale = powf(width, -0.5);
93+
float scale = powf((float)width, -0.5);
9494
ClassEmbedding = scale * torch::randn(width);
9595
PositionalEmbedding = scale * torch::randn({ (int)pow(input_resolution / patch_size/*деление нацело*/, 2) + 1, width });
9696
LnPre = RCLayerNorm(std::vector<int64_t>() = { (int64_t)width });
@@ -220,4 +220,4 @@ torch::Tensor CLIPImpl :: forward(torch::Tensor input_ids, torch::Tensor pixel_v
220220
auto logits_per_text = logits_per_image.t();
221221

222222
return logits_per_image;
223-
}
223+
}

thirdparty/ruclip/RuCLIP.h

Lines changed: 17 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -165,31 +165,30 @@ inline CLIP FromPretrained(const std::filesystem::path &folder)
165165

166166
// Создание модели
167167
auto clip = CLIP("ruclip",
168-
int(config["embed_dim"]),
169-
int(config["image_resolution"]),
170-
int(config["vision_layers"]),
171-
int(config["vision_width"]),
172-
int(config["vision_patch_size"]),
173-
int(config["context_length"]),
174-
int(config["vocab_size"]),
175-
int(config["transformer_width"]),
176-
int(config["transformer_heads"]),
177-
int(config["transformer_layers"]));
178-
179-
for (auto &k : clip->named_parameters())
180-
std::cout << k.key() << std::endl;
168+
int(config["embed_dim"]),
169+
int(config["image_resolution"]),
170+
int(config["vision_layers"]),
171+
int(config["vision_width"]),
172+
int(config["vision_patch_size"]),
173+
int(config["context_length"]),
174+
int(config["vocab_size"]),
175+
int(config["transformer_width"]),
176+
int(config["transformer_heads"]),
177+
int(config["transformer_layers"]));
178+
179+
//for (auto &k : clip->named_parameters())
180+
// std::cout << k.key() << std::endl;
181181
//std::cout << "Model params count: " << Trainable::ParamsCount(clip) << std::endl;
182182

183183
// Загрузка состояния модели из файла
184-
try {
184+
try
185+
{
185186
torch::load(clip, (folder / "jit_model.zip").string());
186187
}
187-
catch (std::exception& e) {
188+
catch (std::exception& e)
189+
{
188190
std::cout << e.what() << std::endl;
189191
}
190192

191-
// "mean" : [0.48145466, 0.4578275, 0.40821073] ,
192-
// "std" : [0.26862954, 0.26130258, 0.27577711]
193-
194193
return clip;
195194
}

thirdparty/ruclip/RuCLIPProcessor.cpp

Lines changed: 61 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#include "RuCLIPProcessor.h"
22

33
///
4-
torch::Tensor CVMatToTorchTensor(const cv::Mat img, const bool perm = true)
4+
inline torch::Tensor CVMatToTorchTensor(const cv::Mat img, const bool perm = true)
55
{
66
auto tensor_image = torch::from_blob(img.data, { img.rows, img.cols, img.channels() }, at::kByte);
77
if (perm)
@@ -12,7 +12,7 @@ torch::Tensor CVMatToTorchTensor(const cv::Mat img, const bool perm = true)
1212
}
1313

1414
///
15-
cv::Mat TorchTensorToCVMat(const torch::Tensor tensor_image, const bool perm = true)
15+
inline cv::Mat TorchTensorToCVMat(const torch::Tensor tensor_image, const bool perm = true)
1616
{
1717
auto t = tensor_image.detach().squeeze().cpu();
1818
if (perm)
@@ -33,20 +33,19 @@ RuCLIPProcessor :: RuCLIPProcessor(
3333
) : ImageSize(image_size), TextSeqLength(text_seq_length), NormMean(norm_mean), NormStd(norm_std)
3434
{
3535
vkcom::Status status;
36-
Tokenizer = new vkcom::BaseEncoder(tokenizer_path, -1, &status);
36+
Tokenizer = std::make_unique<vkcom::BaseEncoder>(tokenizer_path, -1, &status);
3737
}
3838

3939
///!!!Локали-юникоды
40-
torch::Tensor RuCLIPProcessor :: EncodeText(/*std::vector<*/std::string &text)
40+
torch::Tensor RuCLIPProcessor :: EncodeText(const/*std::vector<*/std::string &text) const
4141
{
4242
std::vector<std::vector<int32_t>> ret_ids;
4343
vkcom::Status status;
4444
////for (auto &it : text)
4545
//// it = lowercase(it);
4646
//text = lowercase(text);
4747
//output_type = vkcom::OutputType::ID, bos = false, eos = false, reverse = false, dropout_prob = 0.0
48-
std::vector <std::string> texts;
49-
texts.push_back(text);
48+
std::vector <std::string> texts{ text };
5049
status = Tokenizer->encode_as_ids(texts, &ret_ids);
5150
if (status.code != 0)
5251
throw std::runtime_error("RuCLIPProcessor::EncodeText error : " + status.message);
@@ -61,7 +60,51 @@ torch::Tensor RuCLIPProcessor :: EncodeText(/*std::vector<*/std::string &text)
6160
return PrepareTokens(it);
6261
}
6362

64-
torch::Tensor RuCLIPProcessor :: PrepareTokens(/*std::vector<*/std::vector<int32_t> tokens) //Передаю по значению чтобы внутри иметь дело с копией
63+
///
64+
cv::Mat RuCLIPProcessor::ResizeToInput(const cv::Mat& img, bool saveAspectRatio) const
65+
{
66+
cv::Mat newImg(cv::Size(ImageSize, ImageSize), img.type(), cv::Scalar(0, 0, 0));
67+
68+
if (saveAspectRatio)
69+
{
70+
// resize the image with aspect ratio
71+
float r = std::min(static_cast<float>(ImageSize) / static_cast<float>(img.rows), static_cast<float>(ImageSize) / static_cast<float>(img.cols));
72+
int newHeight = cvRound(img.rows * r);
73+
int newWidth = cvRound(img.cols * r);
74+
75+
// Additional checks for images with non even dims
76+
if ((ImageSize - newWidth) % 2)
77+
newWidth--;
78+
if ((ImageSize - newHeight) % 2)
79+
newHeight--;
80+
assert((ImageSize - newWidth) % 2 == 0);
81+
assert((ImageSize - newHeight) % 2 == 0);
82+
83+
int xOffset = (ImageSize - newWidth) / 2;
84+
int yOffset = (ImageSize - newHeight) / 2;
85+
86+
assert(2 * xOffset + newWidth == ImageSize);
87+
assert(2 * yOffset + newHeight == ImageSize);
88+
89+
cv::resize(img, newImg(cv::Rect(xOffset, yOffset, newWidth, newHeight)), cv::Size(newWidth, newHeight), 0, 0, cv::INTER_CUBIC);
90+
}
91+
else
92+
{
93+
cv::resize(img, newImg, newImg.size(), 0, 0, cv::INTER_CUBIC);
94+
}
95+
return newImg;
96+
}
97+
98+
///
99+
torch::Tensor RuCLIPProcessor::EncodeImage(const cv::Mat& img) const
100+
{
101+
torch::Tensor img_tensor = CVMatToTorchTensor(ResizeToInput(img), true);
102+
img_tensor = torch::data::transforms::Normalize<>(NormMean, NormStd)(img_tensor);
103+
return img_tensor;
104+
}
105+
106+
///
107+
torch::Tensor RuCLIPProcessor::PrepareTokens(/*std::vector<*/std::vector<int32_t> tokens) const //Передаю по значению чтобы внутри иметь дело с копией
65108
{
66109
torch::Tensor result;
67110
if (tokens.size() > TextSeqLength)
@@ -89,7 +132,13 @@ void RuCLIPProcessor::CacheText(const std::vector <std::string>& texts)
89132
}
90133

91134
///
92-
std::pair<torch::Tensor, torch::Tensor> RuCLIPProcessor::operator()(const std::vector <std::string> &texts, const std::vector <cv::Mat> &images)
135+
const std::vector<torch::Tensor>& RuCLIPProcessor::GetTextTensors() const
136+
{
137+
return m_textsTensors;
138+
}
139+
140+
///
141+
std::pair<torch::Tensor, torch::Tensor> RuCLIPProcessor::operator()(const std::vector <std::string> &texts, const std::vector <cv::Mat> &images) const
93142
{
94143
std::vector <torch::Tensor> texts_tensors;
95144
for (auto& it : texts)
@@ -102,21 +151,21 @@ std::pair<torch::Tensor, torch::Tensor> RuCLIPProcessor::operator()(const std::v
102151
std::vector <torch::Tensor> images_tensors;
103152
for (auto &it : images)
104153
{
105-
torch::Tensor img_tensor = CVMatToTorchTensor(it, true);
154+
torch::Tensor img_tensor = CVMatToTorchTensor(ResizeToInput(it), true);
106155
img_tensor = torch::data::transforms::Normalize<>(NormMean, NormStd)(img_tensor);
107156
//img_tensor.clone();
108157
images_tensors.push_back(img_tensor);
109158
}
110-
return std::make_pair(/*torch::pad_sequence*/torch::stack(texts_tensors), torch::pad_sequence(images_tensors).squeeze(0));
159+
return std::make_pair(!texts_tensors.empty()?/*torch::pad_sequence*/torch::stack(texts_tensors):torch::Tensor(), torch::pad_sequence(images_tensors).squeeze(0));
111160
}
112161

113162
///
114-
std::pair<torch::Tensor, torch::Tensor> RuCLIPProcessor::operator()(const std::vector <cv::Mat>& images)
163+
std::pair<torch::Tensor, torch::Tensor> RuCLIPProcessor::operator()(const std::vector <cv::Mat>& images) const
115164
{
116165
std::vector <torch::Tensor> images_tensors;
117166
for (auto& it : images)
118167
{
119-
torch::Tensor img_tensor = CVMatToTorchTensor(it, true);
168+
torch::Tensor img_tensor = CVMatToTorchTensor(ResizeToInput(it), true);
120169
img_tensor = torch::data::transforms::Normalize<>(NormMean, NormStd)(img_tensor);
121170
//img_tensor.clone();
122171
images_tensors.push_back(img_tensor);

0 commit comments

Comments
 (0)