[HACK] Detection of omissions during title indexing

veloman-yunkan · kelson42 · commit c9967eb10fb7 · 2026-01-13T20:36:34.000+01:00
Xapian may silently omit words exceeding a certain size limit during
indexing and there is no API to detect that it happened. This commit
is a hacky way of detecting such omissions at the cost of false
positives triggered by titles containing too much whitespace and/or
punctuation.

Also decreased the limit on the max term size because of crashes
in the newly extracted test-case Suggestion.handlingOfTooLongWords.
diff --git a/src/constants.h b/src/constants.h
@@ -20,3 +20,11 @@
 #define ANCHOR_TERM "0posanchor "
 
 #define DEFAULT_CLUSTER_SIZE 2*1024*1024
+
+// The size in bytes of the longest word that is indexable in a title.
+// Xapian's default value is 64 while the hard limit is 245; however, crashes
+// have been observed with values as low as 150 (demonstrated by the unit test
+// Suggestion.handlingOfTooLongWords in test/suggestion.cpp).
+// Note that a similar limit applies to full-text indexing but we don't
+// provide a way to control it (so it is at Xapian's default value of 64).
+#define MAX_INDEXABLE_TITLE_WORD_SIZE 64
diff --git a/src/writer/xapianIndexer.cpp b/src/writer/xapianIndexer.cpp
@@ -116,14 +116,29 @@ size_t getTermCount(const Xapian::Document& d)
   return std::distance(d.termlist_begin(), d.termlist_end());
 }
 
+size_t sizeOfIndexedText(const Xapian::Document& d) // total byte size of the terms counted below
+{
+  size_t n = 0;
+  for (auto termIt = d.termlist_begin(); termIt != d.termlist_end(); ++termIt) {
+    const std::string& term = *termIt;
+    if ( term[0] != 'Z' ) { // skip 'Z'-prefixed terms (presumably Xapian's stemmed forms -- TODO confirm)
+      n += termIt.get_wdf() * term.size(); // weight the term's byte size by its wdf
+    }
+  }
+  return n;
+}
+
 } // unnamed namespace
 
 /*
  * For title index, index the title with the full path (including the
  * namespace) as data of the document. The targetPath in valuesmap will store
  * the path without namespace.
  *
- * Note that terms (words) longer than 240 bytes are silently ignored.
+ * An exception is thrown if the total size of text in the anchored title that
+ * did not get indexed reaches MAX_INDEXABLE_TITLE_WORD_SIZE (this is intended
+ * to detect omission of too long words during indexing but may be triggered
+ * by excessive whitespace and/or punctuation as well).
  *
  * TODO:
  * Currently for title index we are storing path twice (redirectPath/path in
@@ -134,7 +149,7 @@ size_t getTermCount(const Xapian::Document& d)
 
 void XapianIndexer::indexTitle(const std::string& path, const std::string& title, const std::string& targetPath)
 {
-  const size_t MAX_WORD_LENGTH = 240; // Xapian's hard limit is 245
+  const size_t MAX_WORD_LENGTH = MAX_INDEXABLE_TITLE_WORD_SIZE;
 
   assert(indexingMode == IndexingMode::TITLE);
   Xapian::Stem stemmer;
@@ -165,6 +180,9 @@ void XapianIndexer::indexTitle(const std::string& path, const std::string& title
   if (!unaccentedTitle.empty()) {
     std::string anchoredTitle = ANCHOR_TERM + unaccentedTitle;
     indexer.index_text(anchoredTitle, 1);
+    if ( anchoredTitle.size() >= sizeOfIndexedText(currentDocument) + MAX_WORD_LENGTH ) {
+      throw std::runtime_error("Too much loss of data during title indexing");
+    }
     if ( getTermCount(currentDocument) == 1 ) {
       // only ANCHOR_TERM was added, hence unaccentedTitle is made solely of
       // non-word characters. Then add entire title as a single term.
diff --git a/test/suggestion.cpp b/test/suggestion.cpp
@@ -27,6 +27,7 @@
 
 #include "tools.h"
 #include "../src/tools.h"
+#include "../src/constants.h"
 
 #include "gtest/gtest.h"
 
@@ -705,28 +706,61 @@ std::string makeLongWord(size_t n) {
   return s + std::string(n - s.size(), s.back());
 }
 
-TEST(Suggestion, titleEdgeCases) {
-  const std::string shortOfBeingTooLong = makeLongWord(240);
-  const std::string tooLong = makeLongWord(241);
+void createASingleEntryZimArchive(const std::string& title)
+{
+  TempZimArchiveMadeOfEmptyHtmlArticles tza("en", {{ "path", title}}); // created and dropped right away; callers only check whether creation throws
+}
+
+const size_t MAX_WORD_LENGTH = MAX_INDEXABLE_TITLE_WORD_SIZE;
+
+TEST(Suggestion, handlingOfTooLongWords) {
+  const std::string shortOfBeingTooLong = makeLongWord(MAX_WORD_LENGTH); // word sized exactly at the limit
+  const std::string tooLong = makeLongWord(MAX_WORD_LENGTH+1); // one byte over the limit
+
+  std::vector<std::string> titlesWithTooMuchDiscardableStuff{ // archive creation must throw for each of these
+        tooLong,
+        "Is " + tooLong + " too long?",
+        ";-) " + tooLong,
+        "too much whitespace"  + std::string(MAX_WORD_LENGTH, ' '),
+        "too much punctuation" + std::string(MAX_WORD_LENGTH, '!'),
+  };
+
+  for ( const std::string& title : titlesWithTooMuchDiscardableStuff ) {
+    EXPECT_THROW(createASingleEntryZimArchive(title), std::runtime_error)
+      << "title: " << title;
+  }
 
+  TempZimArchiveMadeOfEmptyHtmlArticles tza("en", { // titles with words at the limit must be accepted
+     // { path                , title   }
+        { "path1", shortOfBeingTooLong                             },
+        { "path2", "Is " + shortOfBeingTooLong + " too long?"      },
+        { "path3", shortOfBeingTooLong + " " + shortOfBeingTooLong },
+  });
+
+  zim::Archive archive(tza.getPath());
+  EXPECT_SUGGESTED_TITLES(archive, "long",
+      "Is " + shortOfBeingTooLong + " too long?"
+  );
+
+  EXPECT_SUGGESTED_TITLES(archive, "awordthatis", // all three accepted titles are expected here
+      shortOfBeingTooLong + " " + shortOfBeingTooLong,
+      shortOfBeingTooLong,
+      "Is " + shortOfBeingTooLong + " too long?"
+  );
+}
+
+TEST(Suggestion, titleEdgeCases) {
   TempZimArchiveMadeOfEmptyHtmlArticles tza("en", {
      // { path     , title   }
 
         { "About"  , "About" }, // Title identical to path
         { "Trout"  , "trout" }, // Title differing from path in case only
         { "Without", ""      }, // No title
                                 //
-        // Titles containing long words
-        { "toolongword1",      "Is " + shortOfBeingTooLong + " too long?" },
-        { "toolongword2",      "Is " + tooLong + " too long?"             },
-        { "toolongsingleword1", shortOfBeingTooLong                       },
-        { "toolongsingleword2", tooLong                                   },
-
         // Handling of pseudo-words consisting exclusively of punctuation
         { "winknsmilewithouttext",          ";-)" }, // A punctuation-only title
         { "winknsmilebothways",             ";-) wink'n'smile" },
         { "winknsmiletheotherwayaround",    "wink'n'smile ;-)" },
-        { "smilinglongword",                ";-) " + tooLong   },
         { "winknsmilewithothernonwords",    "~~ ;-) ~~"        },
 
         // Non edge cases
@@ -754,28 +788,12 @@ TEST(Suggestion, titleEdgeCases) {
       /* nothing */
   );
 
-  EXPECT_SUGGESTED_TITLES(archive, "long",
-      "Is " + tooLong + " too long?",
-      "Is " + shortOfBeingTooLong + " too long?"
-  );
-
-  EXPECT_SUGGESTED_TITLES(archive, "awordthatis",
-      shortOfBeingTooLong,
-      "Is " + shortOfBeingTooLong + " too long?"
-      // The following results aren't included because tooLong has been ignored
-      // during indexing:
-      // - tooLong
-      // - "Is " + tooLong + " too long?"
-      // - ";-) " + tooLong
-  );
-
   EXPECT_SUGGESTED_TITLES(archive, ";-",
       ";-)",
       // The following results aren't included because ";-)" isn't treated as a
       // term in the presence of anything else:
       // - ";-) wink'n'smile"
       // - "wink'n'smile ;-)"
-      // - ";-) " + tooLong
       // - "~~ ;-) ~~"
   );