Skip to content

Commit d419953

Browse files
veloman-yunkankelson42
authored and committed
Increased the max word length for title indexing
1 parent 7cd6dcf commit d419953

File tree

2 files changed

+24
-20
lines changed

2 files changed

+24
-20
lines changed

src/writer/xapianIndexer.cpp

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -119,8 +119,12 @@ size_t getTermCount(const Xapian::Document& d)
119119
} // unnamed namespace
120120

121121
/*
122-
* For title index, index the full path with namespace as data of the document.
123-
* The targetPath in valuesmap will store the path without namespace.
122+
* For title index, index the title with the full path (including the
123+
* namespace) as data of the document. The targetPath in valuesmap will store
124+
* the path without namespace.
125+
*
126+
* Note that terms (words) longer than 240 bytes are silently ignored.
127+
*
124128
* TODO:
125129
* Currently for title index we are storing path twice (redirectPath/path in
126130
* valuesmap and path in index data). In the future, we want to keep only one of
@@ -130,7 +134,7 @@ size_t getTermCount(const Xapian::Document& d)
130134

131135
void XapianIndexer::indexTitle(const std::string& path, const std::string& title, const std::string& targetPath)
132136
{
133-
const size_t MAX_WORD_LENGTH = 64;
137+
const size_t MAX_WORD_LENGTH = 240; // Xapian's hard limit is 245
134138

135139
assert(indexingMode == IndexingMode::TITLE);
136140
Xapian::Stem stemmer;

test/suggestion.cpp

Lines changed: 17 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -706,8 +706,8 @@ std::string makeLongWord(size_t n) {
706706
}
707707

708708
TEST(Suggestion, titleEdgeCases) {
709-
const std::string w64 = makeLongWord(64);
710-
const std::string w65 = makeLongWord(65);
709+
const std::string shortOfBeingTooLong = makeLongWord(240);
710+
const std::string tooLong = makeLongWord(241);
711711

712712
TempZimArchiveMadeOfEmptyHtmlArticles tza("en", {
713713
// { path , title }
@@ -717,17 +717,17 @@ TEST(Suggestion, titleEdgeCases) {
717717
{ "Without", "" }, // No title
718718
//
719719
// Titles containing long words
720-
{ "toolongword1", "Is " + w64 + " too long?" },
721-
{ "toolongword2", "Is " + w65 + " too long?" },
722-
{ "toolongsingleword1", w64 },
723-
{ "toolongsingleword2", w65 },
720+
{ "toolongword1", "Is " + shortOfBeingTooLong + " too long?" },
721+
{ "toolongword2", "Is " + tooLong + " too long?" },
722+
{ "toolongsingleword1", shortOfBeingTooLong },
723+
{ "toolongsingleword2", tooLong },
724724

725725
// Handling of pseudo-words consisting exclusively of punctuation
726726
{ "winknsmilewithouttext", ";-)" }, // A punctuation-only title
727727
{ "winknsmilebothways", ";-) wink'n'smile" },
728728
{ "winknsmiletheotherwayaround", "wink'n'smile ;-)" },
729-
{ "smilinglongword", ";-) " + w65 },
730-
{ "winknsmilewithothernonwords", "~~ ;-) ~~" },
729+
{ "smilinglongword", ";-) " + tooLong },
730+
{ "winknsmilewithothernonwords", "~~ ;-) ~~" },
731731

732732
// Non edge cases
733733
{ "Stout", "About Rex Stout" },
@@ -755,18 +755,18 @@ TEST(Suggestion, titleEdgeCases) {
755755
);
756756

757757
EXPECT_SUGGESTED_TITLES(archive, "long",
758-
"Is " + w65 + " too long?",
759-
"Is " + w64 + " too long?"
758+
"Is " + tooLong + " too long?",
759+
"Is " + shortOfBeingTooLong + " too long?"
760760
);
761761

762762
EXPECT_SUGGESTED_TITLES(archive, "awordthatis",
763-
w64,
764-
"Is " + w64 + " too long?"
765-
// The following results aren't included because w65 has been ignored
763+
shortOfBeingTooLong,
764+
"Is " + shortOfBeingTooLong + " too long?"
765+
// The following results aren't included because tooLong has been ignored
766766
// during indexing:
767-
// - w65
768-
// - "Is " + w65 + " too long?"
769-
// - ";-) " + w65
767+
// - tooLong
768+
// - "Is " + tooLong + " too long?"
769+
// - ";-) " + tooLong
770770
);
771771

772772
EXPECT_SUGGESTED_TITLES(archive, ";-",
@@ -775,7 +775,7 @@ TEST(Suggestion, titleEdgeCases) {
775775
// term in the presence of anything else:
776776
// - ";-) wink'n'smile"
777777
// - "wink'n'smile ;-)"
778-
// - ";-) " + w65
778+
// - ";-) " + tooLong
779779
// - "~~ ;-) ~~"
780780
);
781781

0 commit comments

Comments
 (0)