1:- module(bc_search, [
    2    bc_search/4,            % +Type, +Language, +Query, -Results
    3    bc_index/1,             % +Id
    4    bc_index_remove/1,      % +Id
    5    bc_index_remove/0,      % +Id
    6    bc_index_all/0,
    7    bc_index_clean/0,
    8    bc_cosine_similarity/3, % +Id1, +Id2, -Cosine
    9    bc_add_stopword/1,      % +Word
   10    bc_terms/1              % -List
   11]).

Search support */

   15:- use_module(library(error)).   16:- use_module(library(debug)).   17:- use_module(library(docstore)).   18:- use_module(library(sort_dict)).   19:- use_module(library(dcg/basics)).   20:- use_module(library(porter_stem)).   21
   22:- use_module(bc_excerpt).   23
   24% Reverse index for content
   25% tokens.
   26
   27:- dynamic(content_index/4).   28:- dynamic(indexed/1).   29:- dynamic(term/1).   30:- dynamic(term_idf/2).   31:- dynamic(entry_tfidf/4).   32:- dynamic(stopword/1).

bc_index_clean is det: Cleans current index data. Whole index must be rebuilt afterwards.

   40bc_index_clean:-
   41    with_mutex(bc_index,
   42        index_clean_unsafe).
   43
   44index_clean_unsafe:-
   45    debug(bc_index, 'cleaning index', []),
   46    retractall(content_index(_, _, _, _)),
   47    retractall(indexed(_)),
   48    retractall(term(_)),
   49    retractall(term_idf(_, _)),
   50    retractall(entry_term_tfidf(_, _, _, _)).
   51
   52bc_terms(List):-
   53    findall(Idf-Term, term_idf(Term, Idf), Pairs),
   54    sort(Pairs, List).
   55
   56bc_add_stopword(Word):-
   57    must_be(atom, Word),
   58    (   stopword(Word)
   59    ->  true
   60    ;   assertz(stopword(Word))).
   61
   62:- bc_add_stopword(the).   63:- bc_add_stopword(to).   64:- bc_add_stopword(for).   65:- bc_add_stopword(in).   66:- bc_add_stopword(is).   67:- bc_add_stopword(it).   68:- bc_add_stopword(about).   69:- bc_add_stopword(have).   70:- bc_add_stopword(many).   71:- bc_add_stopword(other).   72:- bc_add_stopword(some).   73:- bc_add_stopword(that).   74:- bc_add_stopword(them).   75:- bc_add_stopword(this).   76:- bc_add_stopword(when).   77
   78% Calculates cosine similarity
   79% between the two given documents.
   80% See also:
   81% https://janav.wordpress.com/2013/10/27/tf-idf-and-cosine-similarity/
   82
   83bc_cosine_similarity(Id1, Id2, Cosine):-
   84    must_be(atom, Id1),
   85    must_be(atom, Id2),
   86    entry_tfidf(Id1, Vector1, Norm1, NonZero1),
   87    score_vector(Vector1, Norm1, NonZero1, Id2, Cosine).
   88
   89score_vector(Vector1, Norm1, NonZero1, Id, Cosine):-
   90    entry_tfidf(Id, Vector2, Norm2, NonZero2),
   91    ord_intersection(NonZero1, NonZero2, NonZero),
   92    dot_product(NonZero, 0, Vector1, Vector2, Product),
   93    Cosine is Product / (Norm1 * Norm2).
   94
   95% Calculates dot product between the
   96% two given documents.
   97
   98dot_product([Index|Indices], Acc, Vector1, Vector2, Product):-
   99    arg(Index, Vector1, TfIdf1),
  100    arg(Index, Vector2, TfIdf2),
  101    Tmp is TfIdf1 * TfIdf2 + Acc,
  102    dot_product(Indices, Tmp, Vector1, Vector2, Product).
  103
  104dot_product([], Acc, _, _, Acc).
  105
  106term_freq(Id, Term, Freq):-
  107    (   content_index(Term, Id, _, Freq)
  108    ->  true
  109    ;   Freq = 0).
  110
  111% Rebuilds IDF values for terms.
  112
  113term_idf_rebuild:-
  114    debug(bc_search, 'rebuilding IDF factor index', []),
  115    retractall(term_idf(_, _)),
  116    findall(Term, term(Term), Terms),
  117    findall(_, indexed(_), Docs),
  118    length(Docs, Count),
  119    maplist(add_term_idf(Count), Terms).
  120
  121% Calculates and updates the
  122% IDF value for the given term.
  123
  124add_term_idf(Count, Term):-
  125    findall(_, content_index(Term, _, _, _), Docs),
  126    length(Docs, NumDocs),
  127    (   NumDocs = 0
  128    ->  Idf = 1
  129    ;   Idf is 1 + log(Count/NumDocs)),
  130    assertz(term_idf(Term, Idf)).
  131
  132% Rebuilds all TF*IDF vectors.
  133
  134entry_tfidf_rebuild_all:-
  135    debug(bc_search, 'rebuilding all TF*IDF vectors', []),
  136    findall(Id, indexed(Id), Ids),
  137    maplist(entry_tfidf_rebuild, Ids).
  138
  139% Rebuilds precalculated entry TF*IDF vector.
  140
  141entry_tfidf_rebuild(Id):-
  142    debug(bc_search, 'rebuilding TF*IDF vector for ~w', [Id]),
  143    retractall(entry_tfidf(Id, _, _, _)),
  144    findall(Term-Idf, term_idf(Term, Idf), IdfsPairs),
  145    maplist(entry_term_tfidf(Id), IdfsPairs, Items),
  146    items_tfidf_vector_norm(Items, Vector, Norm, NonZero),
  147    assertz(entry_tfidf(Id, Vector, Norm, NonZero)).
  148
  149items_tfidf_vector_norm(Items, Vector, Norm, NonZero):-
  150    Vector =.. [tfid|Items],
  151    findall(Index, (
  152        arg(Index, Vector, Value), Value > 0),
  153        NonZero),
  154    tfidf_vector_norm(NonZero, 0, Vector, Norm).
  155
  156% Calculates the TF*IDF value for the given
  157% term in the given entry.
  158
  159entry_term_tfidf(Id, Term-Idf, TfIdf):-
  160    term_freq(Id, Term, Freq),
  161    TfIdf is Freq * Idf.
  162
  163% Calculates vector norm from the TF*IDF vector.
  164
  165tfidf_vector_norm([Index|Indices], Acc, Vector, Norm):-
  166    arg(Index, Vector, TfIdf),
  167    Tmp is TfIdf * TfIdf + Acc,
  168    tfidf_vector_norm(Indices, Tmp, Vector, Norm).
  169
  170tfidf_vector_norm([], Sum, _, Norm):-
  171    Norm is sqrt(Sum).

bc_search(Type, Language, Query, Results) is det: Runs search against the given type. Gives back matching results sorted by match score. Each entry is added excerpt from beginning.

  180bc_search(Type, Language, Query, Results):-
  181    split(Query, Language, Tokens),
  182    ds_find(entry, (type=Type, published=true),
  183        [slug, tags, title, author, date_published,
  184        date_updated, description, language], Entries),
  185    query_tfidf_norm(Tokens, Vector, Norm, NonZero),
  186    (   NonZero = []
  187    ->  Results = []
  188    ;   maplist(score_entry(Vector, Norm, NonZero), Entries, Scored),
  189        include(above_zero, Scored, Filtered),
  190        sort_dict(search_score, desc, Filtered, Sorted),
  191        maplist(add_excerpt, Sorted, WithExcerpt),
  192        maplist(add_author, WithExcerpt, Results)).
  193
  194query_tfidf_norm(Tokens, Vector, Norm, NonZero):-
  195    findall(Term-Idf, term_idf(Term, Idf), IdfsPairs),
  196    maplist(query_term_tfidf(Tokens), IdfsPairs, Items),
  197    items_tfidf_vector_norm(Items, Vector, Norm, NonZero).
  198
  199% Helper to turn query tokens
  200% into a vector of TF*IDF values.
  201
  202query_term_tfidf(Tokens, Term-Idf, TfIdf):-
  203    (   memberchk(Term, Tokens)
  204    ->  TfIdf = Idf
  205    ;   TfIdf = 0).
  206
  207above_zero(Entry):-
  208    Entry.search_score > 0.
  209
  210add_excerpt(Entry, WithExcerpt):-
  211    ds_id(Entry, Id),
  212    ds_col_get(entry, Id, [html], Excerpt),
  213    bc_excerpt(Excerpt.html, 200, Text),
  214    WithExcerpt = Entry.put(excerpt, Text).
  215
  216add_author(Entry, WithAuthor):-
  217    ds_col_get(user, Entry.author, [fullname, link], Author),
  218    WithAuthor = Entry.put(author, Author).
  219
  220% Attaches search_score key to entry
  221% based on token relevancy. Sets search_score
  222% to -1 when no token matches.
  223
  224score_entry(Vector, Norm, NonZero, Entry, WithScore):-
  225    ds_id(Entry, Id),
  226    score_vector(Vector, Norm, NonZero, Id, Score),
  227    WithScore = Entry.put(search_score, Score).

bc_index_all is det: Indexes all entries.

  233bc_index_all:-
  234    debug(bc_search, 'indexing all entries', []),
  235    with_mutex(bc_index, (
  236        ds_all(entry, [], Entries),
  237        maplist(index_entry, Entries),
  238        term_idf_rebuild,
  239        entry_tfidf_rebuild_all)).
  240
  241index_entry(Entry):-
  242    ds_id(Entry, Id),
  243    index_unsafe(Id).

bc_index(+Id, +Content) is det: Indexes the given entry. Should be called when the entry contents is updated.

  251bc_index(Id):-
  252    must_be(atom, Id),
  253    with_mutex(bc_index, (
  254        index_unsafe(Id),
  255        term_idf_rebuild,
  256        entry_tfidf_rebuild(Id))).
  257
  258index_unsafe(Id):-
  259    debug(bc_search, 'indexing ~w', [Id]),
  260    retractall(content_index(_, Id, _, _)),
  261    retractall(indexed(Id)),
  262    ds_col_get(entry, Id,
  263        [content, tags, title, slug, language], Entry),
  264    atom_string(Language, Entry.language),
  265    index_content(Id, Entry.content, Language),
  266    index_tags(Id, Entry.tags, Language),
  267    index_title(Id, Entry.title, Language),
  268    index_slug(Id, Entry.slug, Language),
  269    assertz(indexed(Id)).
  270
  271index_content(Id, Content, Language):-
  272    split(Content, Language, Tokens),
  273    length(Tokens, Length),
  274    maplist(add_content_token(Id, Length), Tokens).
  275
  276index_tags(Id, Tags, Language):-
  277    atomic_list_concat(Tags, ' ', Concat),
  278    split(Concat, Language, Tokens),
  279    maplist(add_tag_token(Id), Tokens).
  280
  281index_title(Id, Title, Language):-
  282    split(Title, Language, Tokens),
  283    maplist(add_tag_token(Id), Tokens).
  284
  285index_slug(Id, Slug, Language):-
  286    split(Slug, Language, Tokens),
  287    maplist(add_tag_token(Id), Tokens).
  288
  289% Adds tag token. Tag token has
  290% relative weight 1.
  291
  292add_tag_token(Id, Token):-
  293    add_term(Token),
  294    retractall(content_index(Token, Id, _, _)),
  295    assertz(content_index(Token, Id, 1, 1)).

bc_index_remove(+Id) is det: Removes the given entry index.

  302bc_index_remove(Id):-
  303    must_be(atom, Id),
  304    with_mutex(bc_index,
  305        retractall(content_index(_, Id, _, _))).

bc_index_remove is det: Removes all indexes.

  311bc_index_remove:-
  312    with_mutex(bc_index,
  313        retractall(content_index(_, _, _, _))).
  314
  315% Adds token to the index.
  316% Recalculates relative historgram.
  317
  318add_content_token(Id, Length, Token):-
  319    add_term(Token),
  320    (   content_index(Token, Id, Count, _)
  321    ->  retractall(content_index(Token, Id, _, _)),
  322        NewCount is Count + 1,
  323        NewRel is NewCount / Length,
  324        assertz(content_index(Token, Id, NewCount, NewRel))
  325    ;   NewRel is 1/Length,
  326        assertz(content_index(Token, Id, 1, NewRel))).
  327
  328% Helper to add token. Only
  329% adds when it does not exist yet.
  330
  331add_term(Token):-
  332    (   term(Token)
  333    ->  true
  334    ;   assertz(term(Token))).
  335
  336% Stems the given term. Non-english entries will not
  337% have stemmed terms.
  338
  339stem_term(en, Term, Stemmed):- !,
  340    catch(porter_stem(Term, Stemmed), Error, true),
  341    (   var(Error)
  342    ->  true
  343    ;   Stemmed = Term).
  344
  345stem_term(_, Term, Term).
  346
  347split(Text, Language, Stemmed):-
  348    must_be(atom, Language),
  349    atom_codes(Text, Codes),
  350    split(Codes, [], [], Tokens),
  351    exclude(unused_token, Tokens, Filtered),
  352    maplist(stem_term(Language), Filtered, Stemmed).
  353
  354% Tokens with length < 2 and stopwords are
  355% not used.
  356
  357unused_token(Token):-
  358    atom_length(Token, Length),
  359    Length < 2, !.
  360
  361unused_token(Token):-
  362    stopword(Token).
  363
  364% Splits list of codes into a list
  365% of tokens (atoms).
  366
  367split([Code|Codes], Context, Acc, Tokens):-
  368    (   split_at([Code|Codes], Context)
  369    ->  reverse(Acc, Token),
  370        atom_codes(Atom, Token),
  371        downcase_atom(Atom, Lower),
  372        Tokens = [Lower|Rest],
  373        split(Codes, [Code|Context], [], Rest)
  374    ;   split(Codes, [Code|Context], [Code|Acc], Tokens)).
  375
  376split([], _, Acc, [Lower]):-
  377    reverse(Acc, Token),
  378    atom_codes(Atom, Token),
  379    downcase_atom(Atom, Lower).
  380
  381% Whether to split at current code or not.
  382
  383% Preserves inter-word dot.
  384
  385split_at([46,Code|_], _):-
  386    code_type(Code, digit), !,
  387    fail.
  388
  389split_at([Code|_], _):-
  390    \+ code_type(Code, alnum)