1010"""Core classes."""
1111
1212import os
13- import re
1413from collections import OrderedDict
1514from datetime import datetime
1615from io import open
1716from pathlib import Path
1817
1918import markdown
19+ from bs4 import BeautifulSoup
2020from flask import abort , current_app , g
2121from werkzeug .local import LocalProxy
22+ from whoosh import index , qparser
23+ from whoosh .analysis import LanguageAnalyzer
24+ from whoosh .fields import ID , TEXT , Schema
25+ from whoosh .writing import AsyncWriter
2226
2327from .markdown_ext import BootstrapExtension
2428from .utils import clean_url , wikilink
@@ -44,7 +48,9 @@ def __init__(self, text):
4448
4549 self .md = markdown .Markdown (extensions = {
4650 BootstrapExtension (),
47- 'codehilite' , 'fenced_code' , 'toc' , 'meta' , 'tables'
51+ 'codehilite' ,
52+ 'fenced_code' ,
53+ 'toc' , 'meta' , 'tables'
4854 }.union (markdown_ext ))
4955
5056 self .input = text
@@ -167,6 +173,19 @@ def render(self):
167173 self .modification_datetime = datetime .fromtimestamp (
168174 os .path .getmtime (self .path ))
169175
def index(self):
    """Index this page's data for the whoosh search engine.

    Opens the index stored in the directory named by the
    ``WIKI_INDEX_DIR`` application setting and adds this page's url,
    title, plain-text body, tags and language to it through
    ``update_document``.

    The writer is used as a context manager: it is committed when the
    block completes and cancelled if anything inside raises, so a failed
    update does not leave the index write lock held.
    """
    # ``index`` is the module-level ``whoosh.index`` import here; the
    # method's own name lives in the class namespace, which is not
    # visible from inside the method body.
    ix = index.open_dir(current_app.config.get('WIKI_INDEX_DIR'))
    with AsyncWriter(ix) as writer:
        writer.update_document(
            url=self.url,
            title=self.title,
            body=self.raw_body,
            tags=self.tags,
            language=self.language,
        )
188+
170189 def save (self , update = True ):
171190 """Save a page."""
172191 folder = os .path .dirname (self .path )
@@ -178,6 +197,7 @@ def save(self, update=True):
178197 f .write (line )
179198 f .write (u'\n ' )
180199 f .write (self .body .replace (u'\r \n ' , u'\n ' ))
200+ self .index ()
181201 if update :
182202 self .load ()
183203 self .render ()
@@ -214,7 +234,6 @@ def title(self):
214234
@title.setter
def title(self, value):
    """Store ``value`` under the page's ``'title'`` key."""
    self['title'] = value
219238
220239 @property
@@ -227,9 +246,23 @@ def tags(self):
227246
@tags.setter
def tags(self, value):
    """Store ``value`` under the page's ``'tags'`` key."""
    self['tags'] = value
232250
@property
def raw_body(self):
    """Return the body as plain text.

    The markdown body is rendered to HTML and the markup is then
    stripped, leaving only the text nodes joined by single spaces.
    Used for indexing and for displaying search results.
    """
    rendered = markdown.markdown(self.body)
    soup = BeautifulSoup(rendered, 'html.parser')
    return soup.get_text(separator=' ')
261+
@raw_body.setter
def raw_body(self, value):
    """Store ``value`` under the page's ``'raw_body'`` key.

    NOTE(review): the ``raw_body`` getter recomputes its result from
    ``self.body`` and does not read this key -- confirm the stored value
    is used somewhere.
    """
    self['raw_body'] = value
265+
233266 @property
234267 def language (self ):
235268 """Return page language.
@@ -239,11 +272,11 @@ def language(self):
239272 """
240273 filename = Path (self .path ).stem
241274 return filename .split ('_' )[- 1 ] if '_' in filename \
242- else current_wiki .languages [0 ]
275+ else list ( current_wiki .languages . keys ()) [0 ]
243276
244277
245278class WikiBase (object ):
246- """."""
279+ """Utility class for wiki management methods ."""
247280
248281 def __init__ (self , root ):
249282 """."""
@@ -309,9 +342,54 @@ def delete(self, url):
309342 if not self .exists (url ):
310343 return False
311344 os .remove (path )
345+ index_dir = index .open_dir (current_app .config .get ('WIKI_INDEX_DIR' ))
346+ writer = AsyncWriter (index_dir )
347+ writer .delete_by_term ('path' , path )
348+ writer .commit ()
312349 return True
313350
314- def index (self ):
def init_search_index(self):
    """Create a new whoosh search index for the wiki.

    The index is created in the directory named by the
    ``WIKI_INDEX_DIR`` application setting. The directory, including
    any missing parent directories, is created when it does not exist.

    NOTE(review): ``index.create_in`` starts a fresh index, so calling
    this on a directory that already holds one presumably discards the
    existing contents -- confirm against the whoosh documentation.
    """
    index_dir = current_app.config.get('WIKI_INDEX_DIR')
    # Schema of one indexed page; ``url`` is the unique key.
    # NOTE(review): title and body are always analysed as French,
    # whatever the language of the page.
    schema = Schema(
        url=ID(stored=True, unique=True),
        title=TEXT(stored=True, analyzer=LanguageAnalyzer("fr")),
        tags=TEXT(stored=True),
        body=TEXT(stored=True, analyzer=LanguageAnalyzer("fr")),
        language=ID(stored=True),
    )
    # makedirs(exist_ok=True) also creates missing parents and avoids
    # the check-then-create race of exists() followed by mkdir().
    os.makedirs(index_dir, exist_ok=True)
    index.create_in(index_dir, schema)
365+
def search(self, query, ix, searcher):
    """Run a query against the whoosh index.

    The query string is parsed against every field of the index schema,
    with terms combined by OR.

    :param str query: the search query
    :param whoosh.index ix: the whoosh index whose schema is used
    :param whoosh.searcher searcher: an active whoosh searcher instance

    :returns: the whoosh results object, with its highlight settings
        adjusted for display
    """
    schema = ix.schema
    parser = qparser.MultifieldParser(
        schema.names(),
        schema=schema,
        group=qparser.OrGroup
    )
    hits = searcher.search(parser.parse(query))
    # context kept around each highlighted match (whoosh counts
    # ``surround`` in characters)
    hits.fragmenter.surround = 50
    # separator shown between highlighted fragments
    hits.formatter.between = '<strong> [...] </strong>'
    return hits
391+
392+ def list_pages (self ):
315393 """Build up a list of all the available pages.
316394
317395 :returns: a list of all the wiki pages
@@ -332,6 +410,11 @@ def index(self):
332410 pages .append (page )
333411 return sorted (pages , key = lambda x : x .title .lower ())
334412
def index_all_pages(self):
    """Index all the pages of the current wiki.

    Calls ``index()`` on every page returned by ``list_pages()``. The
    call goes through the page instance, so the page's own ``index``
    implementation is the one that runs.
    """
    for page in self.list_pages():
        page.index()
417+
335418 def index_by (self , key ):
336419 """Get an index based on the given key.
337420
@@ -352,13 +435,13 @@ def index_by(self, key):
352435 return pages
353436
def get_by_title(self, title):
    """Look up the entry indexed under the given title.

    :param str title: the page title to look for

    :returns: whatever ``index_by('title')`` maps ``title`` to, or
        ``None`` when no page has that title
    """
    # ``list_pages()`` takes no ``attr`` argument and returns a list,
    # so the title-keyed mapping has to come from ``index_by``.
    pages = self.index_by('title')
    return pages.get(title)
358441
359442 def get_tags (self ):
360- """."""
361- pages = self .index ()
443+ """Get all tags ."""
444+ pages = self .list_pages ()
362445 tags = {}
363446 for page in pages :
364447 pagetags = page .tags .split (',' )
@@ -372,9 +455,9 @@ def get_tags(self):
372455 tags [tag ] = [page ]
373456 return tags
374457
375- def index_by_tag (self , tag ):
376- """."""
377- pages = self .index ()
def list_tagged_pages(self, tag):
    """Get a list of all pages that carry a given tag.

    ``page.tags`` is treated as a comma-separated string. The tag must
    equal one of its entries (surrounding whitespace ignored), so a tag
    such as ``py`` does not match a page tagged ``python``.

    :param str tag: the tag to look for

    :returns: the matching pages, sorted by lower-cased title
    """
    wanted = tag.strip()
    tagged = [
        page for page in self.list_pages()
        if wanted in {name.strip() for name in (page.tags or '').split(',')}
    ]
    return sorted(tagged, key=lambda x: x.title.lower())
380463
@@ -388,39 +471,6 @@ def languages(self):
388471 """."""
389472 return current_app .config .get ('WIKI_LANGUAGES' )
390473
391- def search (self , term , ignore_case = True , attrs = None ):
392- """."""
393- if attrs is None :
394- attrs = ['title' , 'tags' , 'body' ]
395- pages = self .index ()
396-
397- for page in pages :
398- page ["score" ] = 0
399-
400- # When searching for "*", return ALL pages
401- if term == "*" :
402- return pages
403-
404- current_language_pages = [
405- p for p in pages if p .language == self .current_language ]
406-
407- # If no query term, return all current language pages
408- if not term :
409- return current_language_pages
410-
411- regex = re .compile (
412- re .escape (term ), re .IGNORECASE if ignore_case else 0 )
413-
414- matched = []
415- for page in current_language_pages :
416- for attr in attrs :
417- if found := re .findall (regex , getattr (page , attr )):
418- page ["score" ] += len (found )
419- if page not in matched :
420- matched .append (page )
421- # Sort results by score
422- return sorted (matched , key = lambda x : x ["score" ], reverse = True )
423-
424474
425475def get_wiki ():
426476 """."""
0 commit comments