Skip to content

Commit

Permalink
Fix issue with stemmer and non-english languages. Bump version. Updat…
Browse files Browse the repository at this point in the history
…e deps.
  • Loading branch information
rla committed Jan 23, 2021
1 parent 62812c4 commit 8bcbe83
Show file tree
Hide file tree
Showing 6 changed files with 75 additions and 41 deletions.
2 changes: 1 addition & 1 deletion pack.pl
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
name('blog_core').
version('1.5.2').
version('1.5.3').
title('Blog/CMS framework').
author('Raivo Laanemets', 'https://rlaanemets.com/').
home('http://blog-core.net/').
Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "blog-core",
"version": "1.5.1",
"version": "1.5.3",
"private": true,
"devDependencies": {
"@babel/core": "^7.2.0",
Expand Down
5 changes: 3 additions & 2 deletions prolog/bc/bc_api_search.pl
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
search(Type):-
http_current_request(Request),
http_parameters(Request, [
q(Query, [atom, default('')])]),
bc_search(Type, Query, Results),
q(Query, [atom, default('')]),
l(Language, [atom, default(en)])]),
bc_search(Type, Language, Query, Results),
bc_reply_success(Results).
4 changes: 2 additions & 2 deletions prolog/bc/bc_dep.pl
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@
bc_check_dependencies:-
check_swi,
check_installed(smtp, v(0, 9, 4)),
check_installed(docstore, v(2, 0, 1)),
check_installed(arouter, v(1, 1, 1)),
check_installed(docstore, v(2, 0, 2)),
check_installed(arouter, v(2, 0, 0)),
check_installed(simple_template, v(1, 0, 1)),
check_installed(sort_dict, v(0, 0, 3)),
check_installed(dict_schema, v(0, 0, 2)),
Expand Down
98 changes: 63 additions & 35 deletions prolog/bc/bc_search.pl
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
:- module(bc_search, [
bc_search/3,
bc_search/4, % +Type, +Language, +Query, -Results
bc_index/1, % +Id
bc_index_remove/1, % +Id
bc_index_remove/0, % +Id
Expand Down Expand Up @@ -170,15 +170,15 @@
tfidf_vector_norm([], Sum, _, Norm):-
Norm is sqrt(Sum).

%! bc_search(Type, Query, Results) is det.
%! bc_search(Type, Language, Query, Results) is det.
%
% Runs search against the given type.
% Gives back matching results sorted
% by match score. Each entry is given
% an excerpt from the beginning.

bc_search(Type, Query, Results):-
split(Query, Tokens),
bc_search(Type, Language, Query, Results):-
split(Query, Language, Tokens),
ds_find(entry, (type=Type, published=true),
[slug, tags, title, author, date_published,
date_updated, description, language], Entries),
Expand Down Expand Up @@ -260,25 +260,38 @@
retractall(content_index(_, Id, _, _)),
retractall(indexed(Id)),
ds_col_get(entry, Id,
[content, tags, title, slug], Entry),
split(Entry.content, Tokens),
length(Tokens, Length),
maplist(add_token(Id, Length), Tokens),
maplist(add_tag_token(Id), Entry.tags),
split(Entry.title, TitleTokens),
maplist(add_tag_token(Id), TitleTokens),
split(Entry.slug, SlugTokens),
maplist(add_tag_token(Id), SlugTokens),
[content, tags, title, slug, language], Entry),
index_content(Id, Entry.content, Entry.language),
index_tags(Id, Entry.tags, Entry.language),
index_title(Id, Entry.title, Entry.language),
index_slug(Id, Entry.slug, Entry.language),
assertz(indexed(Id)).

% Indexes the content text of entry Id.
% The text is split into terms for the given
% Language and each term is registered through
% add_content_token/3 together with the total
% number of terms obtained from the text.

index_content(Id, Content, Language):-
    split(Content, Language, Terms),
    length(Terms, Total),
    maplist(add_content_token(Id, Total), Terms).

% Indexes the tags of entry Id. The tags are
% joined with spaces into a single text, which
% is split into terms for the given Language.
% Each term is registered through add_tag_token/2.

index_tags(Id, Tags, Language):-
    atomic_list_concat(Tags, ' ', Joined),
    split(Joined, Language, Terms),
    maplist(add_tag_token(Id), Terms).

% Indexes the title of entry Id. The title is
% split into terms for the given Language and
% each term is registered through add_tag_token/2.

index_title(Id, Title, Language):-
    split(Title, Language, Terms),
    maplist(add_tag_token(Id), Terms).

% Indexes the slug of entry Id. The slug is
% split into terms for the given Language and
% each term is registered through add_tag_token/2.

index_slug(Id, Slug, Language):-
    split(Slug, Language, Terms),
    maplist(add_tag_token(Id), Terms).

% Adds tag token. Tag token has
% relative weight 1.

add_tag_token(Id, Tag):-
porter_stem(Tag, Stemmed),
add_term(Stemmed),
retractall(content_index(Stemmed, Id, _, _)),
assertz(content_index(Stemmed, Id, 1, 1)).
add_tag_token(Id, Token):-
add_term(Token),
retractall(content_index(Token, Id, _, _)),
assertz(content_index(Token, Id, 1, 1)).

%! bc_index_remove(+Id) is det.
%
Expand All @@ -301,17 +314,15 @@
% Adds token to the index.
% Recalculates relative histogram.

add_token(Id, Length, Token):-
( stopword(Token)
-> true
; add_term(Token),
( content_index(Token, Id, Count, _)
-> retractall(content_index(Token, Id, _, _)),
NewCount is Count + 1,
NewRel is NewCount / Length,
assertz(content_index(Token, Id, NewCount, NewRel))
; NewRel is 1/Length,
assertz(content_index(Token, Id, 1, NewRel)))).
% Records one occurrence of Token in the content
% of entry Id. Length is the total number of
% content tokens of the entry. The stored
% content_index/4 fact holds the occurrence count
% and the count divided by Length; an existing
% fact for the same token and entry is replaced.

add_content_token(Id, Length, Token):-
    add_term(Token),
    (   content_index(Token, Id, Previous, _)
    ->  retractall(content_index(Token, Id, _, _)),
        Count is Previous + 1
    ;   Count = 1),
    Relative is Count / Length,
    assertz(content_index(Token, Id, Count, Relative)).

% Helper to add token. Only
% adds when it does not exist yet.
Expand All @@ -321,15 +332,32 @@
-> true
; assertz(term(Token))).

split(Text, Stemmed):-
% Gives the indexable form of a term. Terms of
% english (en) entries are run through the Porter
% stemmer; when the stemmer throws, the term is
% kept as it is. Terms of entries in any other
% language are never stemmed.

stem_term(Language, Term, Stemmed):-
    (   Language = en
    ->  catch(porter_stem(Term, Stemmed), _, Stemmed = Term)
    ;   Stemmed = Term).

split(Text, Language, Stemmed):-
atom_codes(Text, Codes),
split(Codes, [], [], Tokens),
exclude(empty_token, Tokens, Filtered),
maplist(porter_stem, Filtered, Stemmed).
exclude(unused_token, Tokens, Filtered),
maplist(stem_term(Language), Filtered, Stemmed).

% Tokens with length < 2 and stopwords are
% not used.

empty_token(Token):-
unused_token(Token):-
atom_length(Token, Length),
Length < 2.
Length < 2, !.

unused_token(Token):-
stopword(Token).

% Splits list of codes into a list
% of tokens (atoms).
Expand All @@ -352,7 +380,7 @@

% Preserves inter-word dot.

split_at([0'.,Code|_], _):-
split_at([46,Code|_], _):-
code_type(Code, digit), !,
fail.

Expand Down
5 changes: 5 additions & 0 deletions tests/api_entry.pl
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@
new_post(AuthorId, test_post, _{ title: "" }, Post),
assertion(is_invalid_data(Post)).

% Creates a post whose title contains a character
% outside Latin-1 ("ő") and checks that the reply
% status is "success".
% NOTE(review): presumably a regression test for the
% stemmer failing on such input - confirm.
test('New entry, contains non-latin1', [setup(new_database)]):-
default_user_id(AuthorId),
new_post(AuthorId, test_post, _{ title: "Testő" }, Post),
assertion(Post.status = "success").

test('New entry, no authentication', [setup(new_database)]):-
default_user_id(AuthorId),
set_no_auth,
Expand Down

0 comments on commit 8bcbe83

Please sign in to comment.