Skip to content

Commit 754f9f9

Browse files
Testing the handling of too long words in titles
Xapian's parser (TermGenerator) discards words/terms longer than a certain limit (default: 64). However, terms can also be added to the indexed documents directly via Xapian::Document::add_term(). We do that in order to index titles that consist entirely of non-word characters, but our implementation opens up a loophole that lets words of arbitrary length slip in (when the title is a single word). That leads to crashes if Xapian's hard limit on the length of a term (max 245 characters) is exceeded. The new unit test demonstrates the existence of the loophole.
1 parent 9ca8eb0 commit 754f9f9

File tree

1 file changed

+32
-0
lines changed

1 file changed

+32
-0
lines changed

test/suggestion.cpp

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -695,13 +695,32 @@ TEST(Suggestion, CJK) {
695695
);
696696
}
697697

698+
// Builds a word that is exactly n characters long.
//
// The word starts with "awordthatis<n>characterslong" and is then padded to
// the requested length by repeating its last character. Throws
// std::runtime_error when n is smaller than the length of that prefix.
std::string makeLongWord(size_t n) {
  std::ostringstream prefixBuilder;
  prefixBuilder << "awordthatis" << n << "characterslong";

  std::string word = prefixBuilder.str();
  if ( n < word.size() )
    throw std::runtime_error("That is not a request for a long enough word!");

  // The guard above guarantees that the unsigned subtraction cannot wrap.
  word.append(n - word.size(), word.back());
  return word;
}
707+
698708
TEST(Suggestion, titleEdgeCases) {
709+
const std::string w64 = makeLongWord(64);
710+
const std::string w65 = makeLongWord(65);
711+
699712
TempZimArchiveMadeOfEmptyHtmlArticles tza("en", {
700713
// { path , title }
701714

702715
{ "About" , "About" }, // Title identical to path
703716
{ "Trout" , "trout" }, // Title differing from path in case only
704717
{ "Without", "" }, // No title
718+
//
719+
// Titles containing long words
720+
{ "toolongword1", "Is " + w64 + " too long?" },
721+
{ "toolongword2", "Is " + w65 + " too long?" },
722+
{ "toolongsingleword1", w64 },
723+
{ "toolongsingleword2", w65 },
705724

706725
// Non edge cases
707726
{ "Stout", "About Rex Stout" },
@@ -727,6 +746,19 @@ TEST(Suggestion, titleEdgeCases) {
727746
EXPECT_SUGGESTED_TITLES(archive, "hang"
728747
/* nothing */
729748
);
749+
750+
EXPECT_SUGGESTED_TITLES(archive, "long",
751+
"Is " + w65 + " too long?",
752+
"Is " + w64 + " too long?"
753+
);
754+
755+
EXPECT_SUGGESTED_TITLES(archive, "awordthatis",
756+
w65, // a very long word slips in when it is the only word of a title
757+
w64,
758+
"Is " + w64 + " too long?"
759+
// "Is " + w65 + " too long?" isn't included because w65 has been ignored
760+
// during indexing
761+
);
730762
}
731763

732764
zim::Entry getTitleIndexEntry(const zim::Archive& a)

0 commit comments

Comments
 (0)