1
- // Licensed to the .NET Foundation under one or more agreements.
1
+ // Licensed to the .NET Foundation under one or more agreements.
2
2
// The .NET Foundation licenses this file to you under the MIT license.
3
3
4
4
using System . Collections . Immutable ;
@@ -18,6 +18,8 @@ partial class ExtractSearchIndex : IPostProcessor
18
18
[ GeneratedRegex ( @"\s+" ) ]
19
19
private static partial Regex s_regexWhiteSpace ( ) ;
20
20
21
+ private static readonly Regex s_regexCase = new ( @"[a-z0-9]+|[A-Z0-9]+[a-z0-9]*|[0-9]+" , RegexOptions . Compiled ) ;
22
+
21
23
private static readonly HashSet < string > s_htmlInlineTags = new ( StringComparer . OrdinalIgnoreCase )
22
24
{
23
25
"a" , "area" , "del" , "ins" , "link" , "map" , "meta" , "abbr" , "audio" , "b" , "bdo" , "button" , "canvas" , "cite" , "code" , "command" , "data" ,
@@ -29,12 +31,20 @@ partial class ExtractSearchIndex : IPostProcessor
29
31
public string Name => nameof ( ExtractSearchIndex ) ;
30
32
public const string IndexFileName = "index.json" ;
31
33
34
+ internal bool UseMetadata { get ; set ; } = false ;
35
+ internal bool UseMetadataTitle { get ; set ; } = true ;
36
+
32
37
/// <summary>
/// Enables search by default and reads the search-index options from the supplied metadata.
/// </summary>
/// <param name="metadata">Metadata dictionary to inspect; it is never mutated in place.</param>
/// <returns>
/// The same dictionary when it already contains <c>_enableSearch</c>; otherwise a copy with
/// <c>_enableSearch</c> set to <see langword="true"/>.
/// </returns>
/// <remarks>
/// Side effects: assigns <see cref="UseMetadata"/> and <see cref="UseMetadataTitle"/> and logs the
/// resolved values.
/// </remarks>
public ImmutableDictionary<string, object> PrepareMetadata(ImmutableDictionary<string, object> metadata)
{
    if (!metadata.ContainsKey("_enableSearch"))
    {
        metadata = metadata.Add("_enableSearch", true);
    }

    // Pattern-match rather than hard-cast: a value that is present but null or not a boxed bool
    // (e.g. the string "true") used to throw here. Such values now fall back to the defaults:
    // UseMetadata = false, UseMetadataTitle = true.
    UseMetadata = metadata.TryGetValue("_searchIndexUseMetadata", out var useMetadataObject)
        && useMetadataObject is true;
    UseMetadataTitle = !metadata.TryGetValue("_searchIndexUseMetadataTitle", out var useMetadataTitleObject)
        || useMetadataTitleObject is not false;

    Logger.LogInfo($"{Name}: {nameof(UseMetadata)} = {UseMetadata}, {nameof(UseMetadataTitle)} = {UseMetadataTitle}");
    return metadata;
}
40
50
@@ -49,14 +59,15 @@ public Manifest Process(Manifest manifest, string outputFolder, CancellationToke
49
59
var htmlFiles = ( from item in manifest . Files ?? Enumerable . Empty < ManifestItem > ( )
50
60
from output in item . Output
51
61
where item . Type != "Toc" && output . Key . Equals ( ".html" , StringComparison . OrdinalIgnoreCase )
52
- select output . Value . RelativePath ) . ToList ( ) ;
62
+ select ( output . Value . RelativePath , item . Metadata ) ) . ToList ( ) ;
63
+
53
64
if ( htmlFiles . Count == 0 )
54
65
{
55
66
return manifest ;
56
67
}
57
68
58
69
Logger . LogInfo ( $ "Extracting index data from { htmlFiles . Count } html files") ;
59
- foreach ( var relativePath in htmlFiles )
70
+ foreach ( ( string relativePath , Dictionary < string , object > metadata ) in htmlFiles )
60
71
{
61
72
cancellationToken . ThrowIfCancellationRequested ( ) ;
62
73
@@ -76,7 +87,7 @@ from output in item.Output
76
87
Logger . LogWarning ( $ "Warning: Can't load content from { filePath } : { ex . Message } ") ;
77
88
continue ;
78
89
}
79
- var indexItem = ExtractItem ( html , relativePath ) ;
90
+ var indexItem = ExtractItem ( html , relativePath , metadata ) ;
80
91
if ( indexItem != null )
81
92
{
82
93
indexData [ relativePath ] = indexItem ;
@@ -99,7 +110,7 @@ from output in item.Output
99
110
return manifest ;
100
111
}
101
112
102
- internal SearchIndexItem ExtractItem ( HtmlDocument html , string href )
113
+ internal SearchIndexItem ExtractItem ( HtmlDocument html , string href , Dictionary < string , object > metadata = null )
103
114
{
104
115
var contentBuilder = new StringBuilder ( ) ;
105
116
@@ -117,10 +128,37 @@ internal SearchIndexItem ExtractItem(HtmlDocument html, string href)
117
128
ExtractTextFromNode ( node , contentBuilder ) ;
118
129
}
119
130
120
- var content = NormalizeContent ( contentBuilder . ToString ( ) ) ;
121
- var title = ExtractTitleFromHtml ( html ) ;
131
+ string title ;
132
+ string summary = null ;
133
+ string keywords = null ;
122
134
123
- return new SearchIndexItem { Href = href , Title = title , Keywords = content } ;
135
+ var isMRef = metadata != null && metadata . TryGetValue ( "IsMRef" , out var isMRefMetadata ) && ( bool ) isMRefMetadata ;
136
+ if ( UseMetadata && isMRef )
137
+ {
138
+ title = UseMetadataTitle
139
+ ? ( string ) metadata [ "Title" ] ?? ExtractTitleFromHtml ( html )
140
+ : ExtractTitleFromHtml ( html ) ;
141
+
142
+ var htmlSummary = ( string ) metadata [ "Summary" ] ;
143
+ if ( ! string . IsNullOrEmpty ( htmlSummary ) )
144
+ {
145
+ var htmlDocument = new HtmlDocument ( ) ;
146
+ htmlDocument . LoadHtml ( htmlSummary ) ;
147
+ var htmlRootNode = htmlDocument . DocumentNode . FirstChild ;
148
+ var summaryBuilder = new StringBuilder ( ) ;
149
+ ExtractTextFromNode ( htmlRootNode , summaryBuilder ) ;
150
+ summary = NormalizeContent ( summaryBuilder . ToString ( ) ) ;
151
+ }
152
+
153
+ keywords = string . Join ( ' ' , title . Split ( ' ' ) . Select ( word => string . Join ( ' ' , GetStemAggregations ( word . Split ( '.' ) [ ^ 1 ] ) ) ) ) ;
154
+ }
155
+ else
156
+ {
157
+ title = ExtractTitleFromHtml ( html ) ;
158
+ summary = NormalizeContent ( contentBuilder . ToString ( ) ) ;
159
+ }
160
+
161
+ return new SearchIndexItem { Href = href , Title = title , Summary = summary , Keywords = keywords } ;
124
162
}
125
163
126
164
private static string ExtractTitleFromHtml ( HtmlDocument html )
@@ -140,6 +178,41 @@ private static string NormalizeContent(string str)
140
178
return s_regexWhiteSpace ( ) . Replace ( str , " " ) . Trim ( ) ;
141
179
}
142
180
181
/// <summary>
/// Splits an identifier-like string into the pieces matched by <c>s_regexCase</c>.
/// </summary>
/// <param name="str">Text to split; it is HTML-decoded before matching.</param>
/// <returns>
/// The matched pieces in order of appearance, or a single empty string when
/// <paramref name="str"/> is null or empty.
/// </returns>
private static string[] GetStems(string str)
{
    if (!string.IsNullOrEmpty(str))
    {
        // Decode entities first so the pattern runs over the literal characters.
        var decoded = WebUtility.HtmlDecode(str);
        var matches = s_regexCase.Matches(decoded);

        var stems = new string[matches.Count];
        for (var i = 0; i < stems.Length; i++)
        {
            stems[i] = matches[i].Value;
        }

        return stems;
    }

    return [string.Empty];
}
190
+
191
/// <summary>
/// Builds the concatenation of every non-empty, order-preserving selection of the stems
/// returned by <c>GetStems</c> for <paramref name="str"/>.
/// </summary>
/// <param name="str">Text whose stems are combined.</param>
/// <returns>
/// The combinations in depth-first order: each stem is emitted appended to the current prefix,
/// followed by all longer combinations that start with that prefix, before the next stem is tried.
/// </returns>
private static List<string> GetStemAggregations(string str)
{
    string[] stems = GetStems(str);
    List<string> aggregations = new();

    Collect(stems, 0, new StringBuilder(), aggregations);
    return aggregations;

    // Appends each remaining stem to the prefix in turn, records the result, then recurses on
    // the stems that follow it. The loop simply does not run once start reaches the end.
    static void Collect(string[] parts, int start, StringBuilder prefix, List<string> output)
    {
        for (int position = start; position < parts.Length; position++)
        {
            int mark = prefix.Length;

            prefix.Append(parts[position]);
            output.Add(prefix.ToString());
            Collect(parts, position + 1, prefix, output);

            // Backtrack: drop the stem that was just appended.
            prefix.Length = mark;
        }
    }
}
215
+
143
216
private static void ExtractTextFromNode ( HtmlNode node , StringBuilder contentBuilder )
144
217
{
145
218
if ( node == null )
0 commit comments