Skip to content

Commit df48c9f

Browse files
authored
Merge pull request #1004 from openzim/protection-against-too-long-single-word-titles
Protection against too long single word titles
2 parents 2712a8d + e3a9e9a commit df48c9f

File tree

5 files changed

+188
-31
lines changed

5 files changed

+188
-31
lines changed

include/zim/error.h

Lines changed: 38 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -52,9 +52,10 @@ namespace zim
5252
{}
5353
};
5454

55-
/* Exception thrown by the Creator in case of error.
55+
/**
56+
* Exception thrown by the Creator in case of error.
5657
*
57-
* Most exceptions actually thrown are inheriting this exception.
58+
* Most exceptions actually thrown are subclasses of this type.
5859
*/
5960
class LIBZIM_API CreatorError : public std::runtime_error
6061
{
@@ -64,7 +65,9 @@ namespace zim
6465
{}
6566
};
6667

67-
/* Exception thrown when a entry cannot be added to the Creator.*/
68+
/**
69+
* Exception thrown when an entry cannot be added to the ZIM archive.
70+
*/
6871
class LIBZIM_API InvalidEntry : public CreatorError
6972
{
7073
public:
@@ -73,14 +76,16 @@ namespace zim
7376
{}
7477
};
7578

76-
/* Exception thrown if a incoherence in the user implementation has been detected.
79+
/**
80+
* Exception thrown if incoherence in the user implementation has been
81+
* detected.
7782
*
7883
* Users need to implement interfaces such as:
79-
* - ContentProvider
80-
* - IndexData
81-
* - Item
84+
* - zim::writer::ContentProvider
85+
* - zim::writer::IndexData
86+
* - zim::writer::Item
8287
*
83-
* If a incoherence has been detected in those implementations a
88+
* If incoherence has been detected in those implementations an
8489
* `IncoherentImplementationError` will be thrown.
8590
*/
8691
class LIBZIM_API IncoherentImplementationError : public CreatorError
@@ -91,7 +96,26 @@ namespace zim
9196
{}
9297
};
9398

94-
/* Exception thrown in the main thread when another exception has been
99+
/**
100+
* Exception thrown when there are problems indexing a title.
101+
*
102+
* In the current implementation, the only situation deliberately targeted by
103+
* this type of error is when the title appears to contain a word that
104+
* exceeds the limit on the longest indexable word (as defined by
105+
* `MAX_INDEXABLE_TITLE_WORD_SIZE` in `src/constants.h`) but - due to hacky
106+
* implementation - titles containing too much whitespace and/or punctuation
107+
* may also trigger this error.
108+
*/
109+
class LIBZIM_API TitleIndexingError : public CreatorError
110+
{
111+
public:
112+
explicit TitleIndexingError(const std::string& message)
113+
: CreatorError(message)
114+
{}
115+
};
116+
117+
/**
118+
* Exception thrown in the main thread when another exception has been
95119
* thrown in another worker thread.
96120
*
97121
* Creator uses different worker threads to do background work.
@@ -144,10 +168,12 @@ namespace zim
144168
}
145169
};
146170

147-
/* Exception thrown when the creator is in error state.
171+
/**
172+
* Exception thrown when the creator is in error state.
148173
*
149-
* If the creator is in error state (mostly because a AsyncError has already
150-
* being thrown), any call to any method on it will thrown a `CreatorStateError`.
174+
* If the creator is in error state (mostly because an AsyncError has already
175+
* been thrown), any call to any method on it will throw a
176+
* `CreatorStateError`.
151177
*/
152178
class LIBZIM_API CreatorStateError : public CreatorError
153179
{

include/zim/writer/creator.h

Lines changed: 32 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -43,24 +43,40 @@ namespace zim
4343
* Elements of the zim file can be added using the `add*` methods.
4444
* The final steps is to call `finishZimCreation`.
4545
*
46-
* During the creation of the zim file (and before the call to `finishZimCreation`),
47-
* some values must be set using the `set*` methods.
46+
* During the creation of the zim file (and before the call to
47+
* `finishZimCreation`), some values must be set using the `set*` methods.
48+
*
49+
* All `add*` methods and `finishZimCreation` can throw an exception.
50+
* (most of the time - though not necessarily - a subclass of
51+
* zim::CreatorError). It is up to the user to catch this exception and
52+
* handle the error.
53+
*
54+
* The current (documented) conditions when an exception is thrown are:
55+
*
56+
* - When an entry cannot be added (mainly because an entry with the same
57+
* path has already been added) a `zim::InvalidEntry` will be thrown. The
58+
* creator will still be in a valid state and the creation can continue.
59+
*
60+
* - When there are problems indexing the title a `zim::TitleIndexingError`
61+
* is thrown. The creator will remain in a valid state, however it is not
62+
* clear whether continuing the creation will result in a perfectly valid
63+
* ZIM archive.
64+
*
65+
* - An exception has been thrown in a worker thread. This exception will
66+
* be caught and rethrown via a `zim::AsyncError`. The creator will
67+
* be put in an invalid state and creation cannot continue.
68+
*
69+
* - The creator is in error state. A `zim::CreatorStateError` will be
70+
* thrown.
71+
*
72+
* - Any exception thrown by the user implementation itself. Note that if
73+
* this exception is thrown in a worker thread it will be exposed via an
74+
* AsyncError.
4875
*
49-
* All `add*` methods and `finishZimCreation` can throw a exception.
50-
* (most of the time zim::CreatorError child but not limited to)
51-
* It is up to the user to catch this exception and handle the error.
52-
* The current (documented) conditions when a exception is thrown are:
53-
* - When a entry cannot be added (mainly because a entry with the same path has already been added)
54-
* A `zim::InvalidEntry` will be thrown. The creator will still be in a valid state and the creation can continue.
55-
* - An exception has been thrown in a worker thread.
56-
* This exception will be catch and rethrown through a `zim::AsyncError`.
57-
* The creator will be set in a invalid state and creation cannot continue.
58-
* - The creator is in error state.
59-
* A `zim::CreatorStateError` will be thrown.
60-
* - Any exception thrown by user implementation itself.
61-
* Note that this exception may be thrown in a worker thread and so being "catch" by a AsyncError.
6276
* - Any other exception thrown for unknown reason.
63-
* By default, creator status is not changed by thrown exception and creation should stop.
77+
*
78+
* By default, creator status is not changed by thrown exception and
79+
* creation should stop.
6480
*/
6581
class LIBZIM_API Creator
6682
{

src/constants.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,3 +20,11 @@
2020
#define ANCHOR_TERM "0posanchor "
2121

2222
#define DEFAULT_CLUSTER_SIZE 2*1024*1024
23+
24+
// The size in bytes of the longest word that is indexable in a title.
25+
// Xapian's default value is 64 while the hard limit is 245, however crashes
26+
// have been observed with values as low as 150 (demonstrated by the unit test
27+
// Suggestion.handlingOfTooLongWords in test/suggestion.cpp).
28+
// Note that a similar limit applies to full-text indexing but we don't
29+
// provide a way to control it (so it is at Xapian's default value of 64)
30+
#define MAX_INDEXABLE_TITLE_WORD_SIZE 64

src/writer/xapianIndexer.cpp

Lines changed: 31 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
#include "fs.h"
2525
#include "tools.h"
2626
#include "../constants.h"
27+
#include <zim/error.h>
2728
#include <sstream>
2829
#include <fstream>
2930
#include <stdexcept>
@@ -116,11 +117,30 @@ size_t getTermCount(const Xapian::Document& d)
116117
return std::distance(d.termlist_begin(), d.termlist_end());
117118
}
118119

120+
size_t sizeOfIndexedText(const Xapian::Document& d)
121+
{
122+
size_t n = 0;
123+
for (auto termIt = d.termlist_begin(); termIt != d.termlist_end(); ++termIt) {
124+
const std::string& term = *termIt;
125+
if ( term[0] != 'Z' ) {
126+
n += termIt.get_wdf() * term.size();
127+
}
128+
}
129+
return n;
130+
}
131+
119132
} // unnamed namespace
120133

121134
/*
122-
* For title index, index the full path with namespace as data of the document.
123-
* The targetPath in valuesmap will store the path without namespace.
135+
* For title index, index the title with the full path (including the
136+
* namespace) as data of the document. The targetPath in valuesmap will store
137+
* the path without namespace.
138+
*
139+
* An exception is thrown if the total size of non-indexable text in the title
140+
* exceeds MAX_INDEXABLE_TITLE_WORD_SIZE (this is intended to detect omission
141+
* of too long words during indexing but may be triggered by excessive
142+
* whitespace and/or punctuation as well).
143+
*
124144
* TODO:
125145
* Currently for title index we are storing path twice (redirectPath/path in
126146
* valuesmap and path in index data). In the future, we want to keep only one of
@@ -130,9 +150,12 @@ size_t getTermCount(const Xapian::Document& d)
130150

131151
void XapianIndexer::indexTitle(const std::string& path, const std::string& title, const std::string& targetPath)
132152
{
153+
const size_t MAX_WORD_LENGTH = MAX_INDEXABLE_TITLE_WORD_SIZE;
154+
133155
assert(indexingMode == IndexingMode::TITLE);
134156
Xapian::Stem stemmer;
135157
Xapian::TermGenerator indexer;
158+
indexer.set_max_word_length(MAX_WORD_LENGTH);
136159
indexer.set_flags(Xapian::TermGenerator::FLAG_CJK_NGRAM);
137160
try {
138161
stemmer = Xapian::Stem(stemmer_language);
@@ -158,11 +181,16 @@ void XapianIndexer::indexTitle(const std::string& path, const std::string& title
158181
if (!unaccentedTitle.empty()) {
159182
std::string anchoredTitle = ANCHOR_TERM + unaccentedTitle;
160183
indexer.index_text(anchoredTitle, 1);
184+
if ( anchoredTitle.size() >= sizeOfIndexedText(currentDocument) + MAX_WORD_LENGTH ) {
185+
throw zim::TitleIndexingError("Too much loss of data during title indexing");
186+
}
161187
if ( getTermCount(currentDocument) == 1 ) {
162188
// only ANCHOR_TERM was added, hence unaccentedTitle is made solely of
163189
// non-word characters. Then add entire title as a single term.
164190
currentDocument.remove_term(*currentDocument.termlist_begin());
165-
currentDocument.add_term(unaccentedTitle);
191+
if ( unaccentedTitle.size() <= MAX_WORD_LENGTH ) {
192+
currentDocument.add_term(unaccentedTitle);
193+
}
166194
}
167195
}
168196

test/suggestion.cpp

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,11 +22,13 @@
2222
#include <zim/archive.h>
2323
#include <zim/suggestion.h>
2424
#include <zim/item.h>
25+
#include <zim/error.h>
2526

2627
#include <xapian.h>
2728

2829
#include "tools.h"
2930
#include "../src/tools.h"
31+
#include "../src/constants.h"
3032

3133
#include "gtest/gtest.h"
3234

@@ -695,13 +697,72 @@ TEST(Suggestion, CJK) {
695697
);
696698
}
697699

700+
std::string makeLongWord(size_t n) {
701+
std::ostringstream oss;
702+
oss << "awordthatis" << n << "characterslong";
703+
const std::string s = oss.str();
704+
if ( s.size() > n )
705+
throw std::runtime_error("That is not a request for a long enough word!");
706+
707+
return s + std::string(n - s.size(), s.back());
708+
}
709+
710+
void createASingleEntryZimArchive(const std::string& title)
711+
{
712+
TempZimArchiveMadeOfEmptyHtmlArticles tza("en", {{ "path", title}});
713+
}
714+
715+
const size_t MAX_WORD_LENGTH = MAX_INDEXABLE_TITLE_WORD_SIZE;
716+
717+
TEST(Suggestion, handlingOfTooLongWords) {
718+
const std::string shortOfBeingTooLong = makeLongWord(MAX_WORD_LENGTH);
719+
const std::string tooLong = makeLongWord(MAX_WORD_LENGTH+1);
720+
721+
std::vector<std::string> titlesWithTooMuchDiscardableStuff{
722+
tooLong,
723+
"Is " + tooLong + " too long?",
724+
";-) " + tooLong,
725+
"too much whitespace" + std::string(MAX_WORD_LENGTH, ' '),
726+
"too much punctuation" + std::string(MAX_WORD_LENGTH, '!'),
727+
};
728+
729+
for ( const std::string& title : titlesWithTooMuchDiscardableStuff ) {
730+
EXPECT_THROW(createASingleEntryZimArchive(title), zim::TitleIndexingError)
731+
<< "title: " << title;
732+
}
733+
734+
TempZimArchiveMadeOfEmptyHtmlArticles tza("en", {
735+
// { path , title }
736+
{ "path1", shortOfBeingTooLong },
737+
{ "path2", "Is " + shortOfBeingTooLong + " too long?" },
738+
{ "path3", shortOfBeingTooLong + " " + shortOfBeingTooLong },
739+
});
740+
741+
zim::Archive archive(tza.getPath());
742+
EXPECT_SUGGESTED_TITLES(archive, "long",
743+
"Is " + shortOfBeingTooLong + " too long?"
744+
);
745+
746+
EXPECT_SUGGESTED_TITLES(archive, "awordthatis",
747+
shortOfBeingTooLong + " " + shortOfBeingTooLong,
748+
shortOfBeingTooLong,
749+
"Is " + shortOfBeingTooLong + " too long?"
750+
);
751+
}
752+
698753
TEST(Suggestion, titleEdgeCases) {
699754
TempZimArchiveMadeOfEmptyHtmlArticles tza("en", {
700755
// { path , title }
701756

702757
{ "About" , "About" }, // Title identical to path
703758
{ "Trout" , "trout" }, // Title differing from path in case only
704759
{ "Without", "" }, // No title
760+
//
761+
// Handling of pseudo-words consisting exclusively of punctuation
762+
{ "winknsmilewithouttext", ";-)" }, // A punctuation-only title
763+
{ "winknsmilebothways", ";-) wink'n'smile" },
764+
{ "winknsmiletheotherwayaround", "wink'n'smile ;-)" },
765+
{ "winknsmilewithothernonwords", "~~ ;-) ~~" },
705766

706767
// Non edge cases
707768
{ "Stout", "About Rex Stout" },
@@ -727,6 +788,24 @@ TEST(Suggestion, titleEdgeCases) {
727788
EXPECT_SUGGESTED_TITLES(archive, "hang"
728789
/* nothing */
729790
);
791+
792+
EXPECT_SUGGESTED_TITLES(archive, ";-",
793+
";-)",
794+
// The following results aren't included because ";-)" isn't treated as a
795+
// term in the presence of anything else:
796+
// - ";-) wink'n'smile"
797+
// - "wink'n'smile ;-)"
798+
// - "~~ ;-) ~~"
799+
);
800+
801+
EXPECT_SUGGESTED_TITLES(archive, "win",
802+
";-) wink'n'smile",
803+
"wink'n'smile ;-)"
804+
);
805+
806+
EXPECT_SUGGESTED_TITLES(archive, "smile",
807+
/* nothing */ // smile in "wink'n'smile" isn't a separate term
808+
);
730809
}
731810

732811
zim::Entry getTitleIndexEntry(const zim::Archive& a)

0 commit comments

Comments
 (0)