Skip to content

Commit a937b1f

Browse files
committed
search: implement whoosh search engine
* Updates README. * Also adds some cosmetic fixes. * Adds a CLI with commands to initialize the index and index all pages. Co-authored-by: Pascal Repond <[email protected]>
1 parent 89410af commit a937b1f

File tree

10 files changed

+212
-67
lines changed

10 files changed

+212
-67
lines changed

.gitignore

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -128,4 +128,7 @@ dmypy.json
128128
*.mo
129129

130130
# Macosx
131-
.DS_Store
131+
.DS_Store
132+
133+
# Whoosh index
134+
/examples/index/

README.md

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,16 +8,18 @@ Simple file based wiki for Flask.
88

99
### Requirements
1010

11-
* Python >=3.6.2
11+
* Python >=3.8.0,<4.0.0
1212
* [Poetry](https://python-poetry.org/)
1313

14-
### Install
14+
### Install dev environment
1515

1616
- Clone the git repository
1717
- run `poetry install`
18-
- If you want to enable debug mode, run `export FLASK_ENV=development`
19-
- `cd examples`, then `poetry run flask run`
20-
- go to http://localhost:5000/wiki
18+
- `cd examples`,
19+
- `poetry run flask flask_wiki init-index`
20+
- `poetry run flask flask_wiki index`
21+
- then `poetry run flask run --debug`
22+
- go to http://localhost:5000/help
2123

2224
## Configuration
2325

@@ -36,8 +38,9 @@ Simple file based wiki for Flask.
3638
- WIKI_HOME = 'home'
3739
- WIKI_CURRENT_LANGUAGE = lambda: 'en'
3840
- WIKI_LANGUAGES = ['en']
39-
- WIKI_URL_PREFIX = '/wiki'
41+
- WIKI_URL_PREFIX = '/help'
4042
- WIKI_CONTENT_DIR = './data'
43+
- WIKI_INDEX_DIR = './index'
4144
- WIKI_UPLOAD_FOLDER = os.path.join(WIKI_CONTENT_DIR, 'files')
4245
- WIKI_ALLOWED_EXTENSIONS = {'png', 'jpg', 'jpeg', 'gif', 'svg'}
4346
- WIKI_MARKDOWN_EXTENSIONS = set(('codehilite', 'fenced_code'))

examples/app.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ def get_locale():
4949
app.config.from_mapping(test_config)
5050
Bootstrap4(app)
5151
Wiki(app)
52-
babel = Babel(app, locale_selector=get_locale)
52+
Babel(app, locale_selector=get_locale)
5353

5454
@app.context_processor
5555
def inject_conf_var():

flask_wiki/api.py

Lines changed: 97 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -10,15 +10,19 @@
1010
"""Core classes."""
1111

1212
import os
13-
import re
1413
from collections import OrderedDict
1514
from datetime import datetime
1615
from io import open
1716
from pathlib import Path
1817

1918
import markdown
19+
from bs4 import BeautifulSoup
2020
from flask import abort, current_app, g
2121
from werkzeug.local import LocalProxy
22+
from whoosh import index, qparser
23+
from whoosh.analysis import LanguageAnalyzer
24+
from whoosh.fields import ID, TEXT, Schema
25+
from whoosh.writing import AsyncWriter
2226

2327
from .markdown_ext import BootstrapExtension
2428
from .utils import clean_url, wikilink
@@ -44,7 +48,9 @@ def __init__(self, text):
4448

4549
self.md = markdown.Markdown(extensions={
4650
BootstrapExtension(),
47-
'codehilite', 'fenced_code', 'toc', 'meta', 'tables'
51+
'codehilite',
52+
'fenced_code',
53+
'toc', 'meta', 'tables'
4854
}.union(markdown_ext))
4955

5056
self.input = text
@@ -167,6 +173,19 @@ def render(self):
167173
self.modification_datetime = datetime.fromtimestamp(
168174
os.path.getmtime(self.path))
169175

176+
def index(self):
    """Add or refresh this page in the whoosh search index.

    Opens the index stored in the ``WIKI_INDEX_DIR`` directory, writes
    the page's url, title, plain-text body, tags and language through
    ``update_document`` and commits the change.
    """
    ix = index.open_dir(current_app.config.get('WIKI_INDEX_DIR'))
    writer = AsyncWriter(ix)
    fields = {
        'url': self.url,
        'title': self.title,
        'body': self.raw_body,
        'tags': self.tags,
        'language': self.language,
    }
    writer.update_document(**fields)
    writer.commit()
188+
170189
def save(self, update=True):
171190
"""Save a page."""
172191
folder = os.path.dirname(self.path)
@@ -178,6 +197,7 @@ def save(self, update=True):
178197
f.write(line)
179198
f.write(u'\n')
180199
f.write(self.body.replace(u'\r\n', u'\n'))
200+
self.index()
181201
if update:
182202
self.load()
183203
self.render()
@@ -214,7 +234,6 @@ def title(self):
214234

215235
@title.setter
216236
def title(self, value):
217-
"""."""
218237
self['title'] = value
219238

220239
@property
@@ -227,9 +246,23 @@ def tags(self):
227246

228247
@tags.setter
229248
def tags(self, value):
230-
"""."""
231249
self['tags'] = value
232250

251+
@property
def raw_body(self):
    """Return the body as plain text.

    The markdown body is rendered to HTML, then every tag is stripped
    and the remaining text nodes are joined with a single space. The
    result is what gets indexed and shown in search results.
    """
    soup = BeautifulSoup(markdown.markdown(self.body), 'html.parser')
    return soup.get_text(separator=' ')

@raw_body.setter
def raw_body(self, value):
    """Store ``value`` under the ``'raw_body'`` key of the page.

    NOTE(review): the getter recomputes the text from ``self.body`` and
    does not read this key back.
    """
    self['raw_body'] = value
265+
233266
@property
234267
def language(self):
235268
"""Return page language.
@@ -239,11 +272,11 @@ def language(self):
239272
"""
240273
filename = Path(self.path).stem
241274
return filename.split('_')[-1] if '_' in filename\
242-
else current_wiki.languages[0]
275+
else list(current_wiki.languages.keys())[0]
243276

244277

245278
class WikiBase(object):
246-
"""."""
279+
"""Utility class for wiki management methods."""
247280

248281
def __init__(self, root):
249282
"""."""
@@ -309,9 +342,54 @@ def delete(self, url):
309342
if not self.exists(url):
310343
return False
311344
os.remove(path)
345+
index_dir = index.open_dir(current_app.config.get('WIKI_INDEX_DIR'))
346+
writer = AsyncWriter(index_dir)
347+
writer.delete_by_term('path', path)
348+
writer.commit()
312349
return True
313350

314-
def index(self):
351+
def init_search_index(self):
    """Create a new whoosh search index for the wiki.

    The index lives in the directory named by the ``WIKI_INDEX_DIR``
    configuration value. The directory, including any missing parent,
    is created when it does not exist yet.

    NOTE(review): the ``title`` and ``body`` analyzers are hard-coded to
    French regardless of the wiki languages -- confirm this is intended.
    """
    index_dir = current_app.config.get('WIKI_INDEX_DIR')
    # initialize whoosh index schema
    schema = Schema(
        url=ID(stored=True, unique=True),
        title=TEXT(stored=True, analyzer=LanguageAnalyzer("fr")),
        tags=TEXT(stored=True),
        body=TEXT(stored=True, analyzer=LanguageAnalyzer("fr")),
        language=ID(stored=True)
    )
    # makedirs(exist_ok=True) avoids the check-then-create race of
    # os.path.exists + os.mkdir and also handles nested index paths.
    os.makedirs(index_dir, exist_ok=True)
    index.create_in(index_dir, schema)
365+
366+
def search(self, query, ix, searcher):
    """Run a query against the whoosh index.

    The query is parsed against every field of the index schema, with
    terms combined by OR.

    :param str query: the search query
    :param whoosh.index ix: the whoosh index to use
    :param whoosh.searcher searcher: an active whoosh searcher instance

    :returns: a whoosh.results object instance, with its highlight
        fragmenter and formatter tuned for display
    """
    schema = ix.schema
    parser = qparser.MultifieldParser(
        schema.names(),
        schema=schema,
        group=qparser.OrGroup
    )
    results = searcher.search(parser.parse(query))
    # keep 50 characters of context around the matched terms
    # (whoosh's ``surround`` is counted in characters, not words)
    results.fragmenter.surround = 50
    # separator inserted between highlighted fragments
    results.formatter.between = '<strong> [...] </strong>'
    return results
391+
392+
def list_pages(self):
315393
"""Build up a list of all the available pages.
316394
317395
:returns: a list of all the wiki pages
@@ -332,6 +410,11 @@ def index(self):
332410
pages.append(page)
333411
return sorted(pages, key=lambda x: x.title.lower())
334412

413+
def index_all_pages(self):
    """Index all the pages for the current wiki.

    Calls ``index()`` on every page returned by ``list_pages()``. The
    method is invoked on the page itself rather than through the
    ``Page`` class, so normal method dispatch applies.
    """
    for page in self.list_pages():
        page.index()
417+
335418
def index_by(self, key):
336419
"""Get an index based on the given key.
337420
@@ -352,13 +435,13 @@ def index_by(self, key):
352435
return pages
353436

354437
def get_by_title(self, title):
    """Get the page that has the given title.

    ``list_pages()`` takes no argument and returns a list, so the pages
    are scanned for an exact title match instead of being looked up in
    a mapping.

    :param str title: the exact title to look for
    :returns: the first matching page in ``list_pages()`` order, or
        ``None`` when no page has this title
    """
    return next(
        (page for page in self.list_pages() if page.title == title),
        None
    )
358441

359442
def get_tags(self):
360-
"""."""
361-
pages = self.index()
443+
"""Get all tags."""
444+
pages = self.list_pages()
362445
tags = {}
363446
for page in pages:
364447
pagetags = page.tags.split(',')
@@ -372,9 +455,9 @@ def get_tags(self):
372455
tags[tag] = [page]
373456
return tags
374457

375-
def index_by_tag(self, tag):
376-
"""."""
377-
pages = self.index()
458+
def list_tagged_pages(self, tag):
    """Get a list of all pages that have a tag.

    A page's ``tags`` value is a comma-separated string, so it is split
    and each entry compared as a whole. A plain substring test would
    wrongly match e.g. ``py`` against a page tagged ``python``.

    :param str tag: the tag to look for (surrounding whitespace ignored)
    :returns: the matching pages, sorted case-insensitively by title
    """
    wanted = tag.strip()
    tagged = [
        page for page in self.list_pages()
        if wanted in (entry.strip() for entry in page.tags.split(','))
    ]
    return sorted(tagged, key=lambda x: x.title.lower())
380463

@@ -388,39 +471,6 @@ def languages(self):
388471
"""."""
389472
return current_app.config.get('WIKI_LANGUAGES')
390473

391-
def search(self, term, ignore_case=True, attrs=None):
392-
"""."""
393-
if attrs is None:
394-
attrs = ['title', 'tags', 'body']
395-
pages = self.index()
396-
397-
for page in pages:
398-
page["score"] = 0
399-
400-
# When searching for "*", return ALL pages
401-
if term == "*":
402-
return pages
403-
404-
current_language_pages = [
405-
p for p in pages if p.language == self.current_language]
406-
407-
# If no query term, return all current language pages
408-
if not term:
409-
return current_language_pages
410-
411-
regex = re.compile(
412-
re.escape(term), re.IGNORECASE if ignore_case else 0)
413-
414-
matched = []
415-
for page in current_language_pages:
416-
for attr in attrs:
417-
if found := re.findall(regex, getattr(page, attr)):
418-
page["score"] += len(found)
419-
if page not in matched:
420-
matched.append(page)
421-
# Sort results by score
422-
return sorted(matched, key=lambda x: x["score"], reverse=True)
423-
424474

425475
def get_wiki():
426476
"""."""

flask_wiki/cli.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
# -*- coding: utf-8 -*-
2+
#
3+
# This file is part of Flask-Wiki
4+
# Copyright (C) 2023 RERO
5+
#
6+
# Flask-Wiki is free software; you can redistribute it and/or modify
7+
# it under the terms of the Revised BSD License; see LICENSE file for
8+
# more details.
9+
10+
"""Click command-line interface for flask-wiki."""
11+
12+
import click
13+
from flask.cli import with_appcontext
14+
15+
from .api import get_wiki
16+
17+
18+
@click.group()
def flask_wiki():
    """Command-line interface for flask-wiki."""
    # Pure command group: the docstring is the whole body; sub-commands
    # attach themselves through ``@flask_wiki.command()``.
22+
23+
24+
@flask_wiki.command()
@with_appcontext
def init_index():
    """Init whoosh search index."""
    # Runs inside the Flask application context (``with_appcontext``).
    wiki = get_wiki()
    wiki.init_search_index()
29+
30+
31+
@flask_wiki.command()
@with_appcontext
def index():
    """Index all wiki pages for whoosh search."""
    # Runs inside the Flask application context (``with_appcontext``).
    wiki = get_wiki()
    wiki.index_all_pages()

flask_wiki/config.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,10 @@
2727
WIKI_HOME = 'home'
2828
WIKI_CURRENT_LANGUAGE = lambda: 'en'
2929
WIKI_LANGUAGES = ['en']
30-
WIKI_URL_PREFIX = '/wiki'
30+
WIKI_URL_PREFIX = '/help'
3131
WIKI_CONTENT_DIR = './data'
3232
WIKI_UPLOAD_FOLDER = os.path.join(WIKI_CONTENT_DIR, 'files')
33+
WIKI_INDEX_DIR = './index'
3334
WIKI_ALLOWED_EXTENSIONS = {'png', 'jpg', 'jpeg', 'gif', 'svg'}
3435

3536
"""Markdown Extensions.

flask_wiki/templates/wiki/search.html

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,20 +19,20 @@
1919
<header>
2020
<div class="pb-3">
2121
{{ results | length }} {{ ngettext('result', 'results', results | length) }}
22-
<a href="{{ url_for('wiki.search', q='*') }}" class="btn btn-sm btn-outline-primary ml-2">{{ _("All languages") }}</a>
2322
</div>
2423
</header>
2524
<div>
2625
<ul class="list-group list-group-flush">
2726
{%- for result in results -%}
2827
<li class="list-group-item">
2928
<div class="d-lg-flex justify-content-lg-between">
30-
<article class="mb-2">
29+
<article class="mb-2 col-10">
3130
<h5 class="m-0">
3231
<a class="mr-2" href="{{ url_for('wiki.page', url=result.url) }}">
3332
{{ result.title }}
3433
</a>
3534
</h5>
35+
<p>{{ result.highlights("body", top=3) | safe }}</p>
3636
<ul class="m-0 p-0">
3737
<li class="badge badge-secondary">
3838
{{ result.language | upper }}
@@ -45,7 +45,7 @@ <h5 class="m-0">
4545
</ul>
4646
</article>
4747
{% if can_edit_wiki %}
48-
<footer>
48+
<footer class="ml-3 col-4">
4949
<button data-name="{{ result.title }}"
5050
data-link="{{ url_for('wiki.page', url=result.url) }}"
5151
class="copy-file-code btn btn-sm btn-outline-primary">

0 commit comments

Comments
 (0)