Skip to content

Commit c9967eb

Browse files
veloman-yunkankelson42
authored andcommitted
[HACK] Detection of omissions during title indexing
Xapian may silently omit words exceeding a certain size limit during indexing and there is no API to detect that it happened. This commit is a hacky way of detecting such omissions at the cost of false positives trigerred by titles containing too much whitespace and/or punctuation. Also decreased the limit on the max term size because of crashes in the newly extracted test-case Suggestion.handlingOfTooLongWords.
1 parent d419953 commit c9967eb

File tree

3 files changed

+72
-28
lines changed

3 files changed

+72
-28
lines changed

src/constants.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,3 +20,11 @@
2020
#define ANCHOR_TERM "0posanchor "
2121

2222
#define DEFAULT_CLUSTER_SIZE 2*1024*1024
23+
24+
// The size in bytes of the longest word that is indexable in a title.
25+
// Xapian's default value is 64 while the hard limit is 245, however crashes
26+
// have been observed with values as low as 150 (demonstrated by the unit test
27+
// Suggestion.handlingOfTooLongWords in test/suggestion.cpp).
28+
// Note that a similar limit applies to full-text indexing but we don't
29+
// provide a way to control it (so it is at Xapian's default value of 64)
30+
#define MAX_INDEXABLE_TITLE_WORD_SIZE 64

src/writer/xapianIndexer.cpp

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -116,14 +116,29 @@ size_t getTermCount(const Xapian::Document& d)
116116
return std::distance(d.termlist_begin(), d.termlist_end());
117117
}
118118

119+
size_t sizeOfIndexedText(const Xapian::Document& d)
120+
{
121+
size_t n = 0;
122+
for (auto termIt = d.termlist_begin(); termIt != d.termlist_end(); ++termIt) {
123+
const std::string& term = *termIt;
124+
if ( term[0] != 'Z' ) {
125+
n += termIt.get_wdf() * term.size();
126+
}
127+
}
128+
return n;
129+
}
130+
119131
} // unnamed namespace
120132

121133
/*
122134
* For title index, index the title with the full path (including the
123135
* namespace) as data of the document. The targetPath in valuesmap will store
124136
* the path without namespace.
125137
*
126-
* Note that terms (words) longer than 240 bytes are silently ignored.
138+
* An exception is thrown if the total size of non-indexable text in the title
139+
* exceeds MAX_INDEXABLE_TITLE_WORD_SIZE (this is intended to detect omission
140+
* of too long words during indexing but may be triggered by excessive
141+
* whitespace and/or punctuation as well).
127142
*
128143
* TODO:
129144
* Currently for title index we are storing path twice (redirectPath/path in
@@ -134,7 +149,7 @@ size_t getTermCount(const Xapian::Document& d)
134149

135150
void XapianIndexer::indexTitle(const std::string& path, const std::string& title, const std::string& targetPath)
136151
{
137-
const size_t MAX_WORD_LENGTH = 240; // Xapian's hard limit is 245
152+
const size_t MAX_WORD_LENGTH = MAX_INDEXABLE_TITLE_WORD_SIZE;
138153

139154
assert(indexingMode == IndexingMode::TITLE);
140155
Xapian::Stem stemmer;
@@ -165,6 +180,9 @@ void XapianIndexer::indexTitle(const std::string& path, const std::string& title
165180
if (!unaccentedTitle.empty()) {
166181
std::string anchoredTitle = ANCHOR_TERM + unaccentedTitle;
167182
indexer.index_text(anchoredTitle, 1);
183+
if ( anchoredTitle.size() >= sizeOfIndexedText(currentDocument) + MAX_WORD_LENGTH ) {
184+
throw std::runtime_error("Too much loss of data during title indexing");
185+
}
168186
if ( getTermCount(currentDocument) == 1 ) {
169187
// only ANCHOR_TERM was added, hence unaccentedTitle is made solely of
170188
// non-word characters. Then add entire title as a single term.

test/suggestion.cpp

Lines changed: 44 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727

2828
#include "tools.h"
2929
#include "../src/tools.h"
30+
#include "../src/constants.h"
3031

3132
#include "gtest/gtest.h"
3233

@@ -705,28 +706,61 @@ std::string makeLongWord(size_t n) {
705706
return s + std::string(n - s.size(), s.back());
706707
}
707708

708-
TEST(Suggestion, titleEdgeCases) {
709-
const std::string shortOfBeingTooLong = makeLongWord(240);
710-
const std::string tooLong = makeLongWord(241);
709+
void createASingleEntryZimArchive(const std::string& title)
710+
{
711+
TempZimArchiveMadeOfEmptyHtmlArticles tza("en", {{ "path", title}});
712+
}
713+
714+
const size_t MAX_WORD_LENGTH = MAX_INDEXABLE_TITLE_WORD_SIZE;
715+
716+
TEST(Suggestion, handlingOfTooLongWords) {
717+
const std::string shortOfBeingTooLong = makeLongWord(MAX_WORD_LENGTH);
718+
const std::string tooLong = makeLongWord(MAX_WORD_LENGTH+1);
719+
720+
std::vector<std::string> titlesWithTooMuchDiscardableStuff{
721+
tooLong,
722+
"Is " + tooLong + " too long?",
723+
";-) " + tooLong,
724+
"too much whitespace" + std::string(MAX_WORD_LENGTH, ' '),
725+
"too much punctuation" + std::string(MAX_WORD_LENGTH, '!'),
726+
};
727+
728+
for ( const std::string& title : titlesWithTooMuchDiscardableStuff ) {
729+
EXPECT_THROW(createASingleEntryZimArchive(title), std::runtime_error)
730+
<< "title: " << title;
731+
}
711732

733+
TempZimArchiveMadeOfEmptyHtmlArticles tza("en", {
734+
// { path , title }
735+
{ "path1", shortOfBeingTooLong },
736+
{ "path2", "Is " + shortOfBeingTooLong + " too long?" },
737+
{ "path3", shortOfBeingTooLong + " " + shortOfBeingTooLong },
738+
});
739+
740+
zim::Archive archive(tza.getPath());
741+
EXPECT_SUGGESTED_TITLES(archive, "long",
742+
"Is " + shortOfBeingTooLong + " too long?"
743+
);
744+
745+
EXPECT_SUGGESTED_TITLES(archive, "awordthatis",
746+
shortOfBeingTooLong + " " + shortOfBeingTooLong,
747+
shortOfBeingTooLong,
748+
"Is " + shortOfBeingTooLong + " too long?"
749+
);
750+
}
751+
752+
TEST(Suggestion, titleEdgeCases) {
712753
TempZimArchiveMadeOfEmptyHtmlArticles tza("en", {
713754
// { path , title }
714755

715756
{ "About" , "About" }, // Title identical to path
716757
{ "Trout" , "trout" }, // Title differing from path in case only
717758
{ "Without", "" }, // No title
718759
//
719-
// Titles containing long words
720-
{ "toolongword1", "Is " + shortOfBeingTooLong + " too long?" },
721-
{ "toolongword2", "Is " + tooLong + " too long?" },
722-
{ "toolongsingleword1", shortOfBeingTooLong },
723-
{ "toolongsingleword2", tooLong },
724-
725760
// Handling of pseudo-words consisting exclusively of punctuation
726761
{ "winknsmilewithouttext", ";-)" }, // A punctuation-only title
727762
{ "winknsmilebothways", ";-) wink'n'smile" },
728763
{ "winknsmiletheotherwayaround", "wink'n'smile ;-)" },
729-
{ "smilinglongword", ";-) " + tooLong },
730764
{ "winknsmilewithothernonwords", "~~ ;-) ~~" },
731765

732766
// Non edge cases
@@ -754,28 +788,12 @@ TEST(Suggestion, titleEdgeCases) {
754788
/* nothing */
755789
);
756790

757-
EXPECT_SUGGESTED_TITLES(archive, "long",
758-
"Is " + tooLong + " too long?",
759-
"Is " + shortOfBeingTooLong + " too long?"
760-
);
761-
762-
EXPECT_SUGGESTED_TITLES(archive, "awordthatis",
763-
shortOfBeingTooLong,
764-
"Is " + shortOfBeingTooLong + " too long?"
765-
// The following results aren't included because tooLong has been ignored
766-
// during indexing:
767-
// - tooLong
768-
// - "Is " + tooLong + " too long?"
769-
// - ";-) " + tooLong
770-
);
771-
772791
EXPECT_SUGGESTED_TITLES(archive, ";-",
773792
";-)",
774793
// The following results aren't included because ";-)" isn't treated as a
775794
// term in the presence of anything else:
776795
// - ";-) wink'n'smile"
777796
// - "wink'n'smile ;-)"
778-
// - ";-) " + tooLong
779797
// - "~~ ;-) ~~"
780798
);
781799

0 commit comments

Comments
 (0)