1:- module(regex, [ (=~)/2
    2                 , (\~)/2
    3                 , op(700,xfx,=~)
    4                 , op(700,xfx,\~)
    5                 , regex/4
    6                 ]).    7:- use_module(library(error), [domain_error/2]).    8:- use_module(library(regex/state), [new_state/3, numbered_captures/2]).    9:- use_module(library(regex/parser), [re//2]).   10:- use_module(library(regex/engine/pp), [engine_match/5]).   11
   12
   13% operators for matching strings against regular expressions.
   14% the syntax is the same used by Perl and Haskell, but Prolog
   15% doesn't like '!' in operators so I had to use '\' instead.
   16:- op(700,xfx,=~).   17:- op(700,xfx,\~).
 =~(+Text, +Pattern) is semidet
True if Text matches regular expression Pattern. Only the first match is considered. Text and Pattern can be atoms or code lists.

Named captures are automatically bound to corresponding named variables in the surrounding scope. For example,

"Hi John" =~ "hi (?<Name>[a-z]+)"/i,
Name == "John".
   30Text =~ Pattern :-
   31    expand_equalstilde(Text =~ Pattern, _, Goal),
   32    call(Goal).
   33
   34expand_equalstilde(Text =~ Pattern, Vars, regex(P,Options,Text,Vars)) :-
   35    ( nonvar(Pattern), Pattern = P/Options ->
   36        true
   37    ; % no explicit options ->
   38        P = Pattern,
   39        Options = []
   40    ).
   41
   42% macro expansion giving access to in-scope variables.
   43user:goal_expansion(Text =~ Pattern, Goal) :-
   44    % is goal expansion wanted?
   45    prolog_load_context(module, Module),
   46    Module \== regex,  % we don't want string interpolation ourselves
   47    predicate_property(Module:(_=~_),imported_from(regex)),
   48
   49    prolog_load_context(variable_names, Vars),
   50    expand_equalstilde(Text =~ Pattern, Vars, Goal).
 \~(+Text, +Pattern) is semidet
Like `\+ Text =~ Pattern`.
   56Text \~ Pattern :-
   57    \+ Text =~ Pattern.
 regex(+Pattern:text, +Options, +Text:text, ?Captures:list) is semidet
True if Text matches the regular expression Pattern. The pattern's behavior is influenced by Options (see below). The values of any capturing subgroups are unified with Captures (see below). A text value may either be an atom or a list of codes.

Options can either be an atom or a list of options. If an atom, it's split into a list of single character atoms which is used as the Options value. This allows on to use is, for example, instead of [i,s]. Acceptable options are:

Captures is unified with a list of captured values, with the leftmost capture first, etc. Each captured value is a list of codes. For example,

?- regex('(a+)(b*)', [], 'aaabbbbb', [A,B]).
A = "aaa",
B = "bbbbb".

Named captures are also supported. In that case, Captures must be a list of pairs like ['A'=A,'B'=B]. Every named capture in the pattern must have a corresponding key in Captures. (This is a temporary restriction and will be removed later).

A brief word on argument order. Prolog convention prefers to place an Options argument as the final argument or as the last one before outputs. However, widely followed regular expression convention places options immediately after the pattern. I chose to follow the latter convention. This argument order benefits higher-order calls like maplist/3 which can do things like:

?- maplist(regex('(a+)(b+)', i), [ab, aab, abb], L).
L = [["a", "b"], ["aa", "b"], ["a", "bb"]].
   98regex(Pattern,Options,Text,Captures) :-
   99    % normalize text representations
  100    text_codes(Text, T),
  101    text_codes(Pattern, P0),
  102    starts_with_caret(P0,P,StartingCaret),
  103
  104    % normalize options and captures into a state value
  105    new_state(Options, Captures, State0),
  106
  107    % compile Pattern
  108    ( phrase(re(State0,Re),P) ->
  109        ( StartingCaret=yes ->
  110            once(engine_match(Re, State0, State, T, _))
  111        ; otherwise ->
  112            once(regex_no_sugar(Re, State0, State, T, _))
  113        ),
  114        ( var(Captures) ->
  115            numbered_captures(State, Captures)
  116        ; % captures already bound ->
  117            true
  118        )
  119    ; % invalid pattern ->
  120        atom_codes(A, P),
  121        domain_error(regex, A)
  122    ).
  123
  124
  125starts_with_caret([0'^|P],P,yes) :- % ' syntax highlighter
  126    !.
  127starts_with_caret(P,P,no).
  128
  129
  130% the heart and soul of regex/4
  131regex_no_sugar(Re, State0, State) -->
  132    engine_match(Re, State0, State).
  133regex_no_sugar(Re, State0, State) -->
  134    [_],
  135    regex_no_sugar(Re, State0, State).
 text_codes(+Text, -Codes)
Convert Text (atom or codes) into Codes.
  141text_codes(Atom, Codes) :-
  142    atom(Atom),
  143    !,
  144    atom_codes(Atom, Codes).
  145text_codes(String, Codes) :-
  146    string(String),
  147    !,
  148    string_codes(String, Codes).
  149text_codes(Codes, Codes)