View source with raw comments or as raw
    1/*  Part of SWI-Prolog
    2
    3    Author:        Jan Wielemaker and Peter Ludemann
    4    E-mail:        jan@swi-prolog.org
    5    WWW:           http://www.swi-prolog.org
    6    Copyright (c)  2017-2023, VU University Amsterdam
    7                              SWI-Prolog Solutions b.v.
    8    All rights reserved.
    9
   10    Redistribution and use in source and binary forms, with or without
   11    modification, are permitted provided that the following conditions
   12    are met:
   13
   14    1. Redistributions of source code must retain the above copyright
   15       notice, this list of conditions and the following disclaimer.
   16
   17    2. Redistributions in binary form must reproduce the above copyright
   18       notice, this list of conditions and the following disclaimer in
   19       the documentation and/or other materials provided with the
   20       distribution.
   21
   22    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   23    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   24    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
   25    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
   26    COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   27    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
   28    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   29    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
   30    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   31    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
   32    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   33    POSSIBILITY OF SUCH DAMAGE.
   34*/
   35
   36:- module(pcre,
   37          [ re_match/2,           % +Regex, +String
   38            re_match/3,           % +Regex, +String, +Options
   39            re_matchsub/3,        % +Regex, +String, -Subs
   40            re_matchsub/4,        % +Regex, +String, -Subs, +Options
   41            re_foldl/6,           % :Goal, +Regex, +String, ?V0, ?V, +Options
   42            re_split/3,           % +Pattern, +String, -Split:list
   43            re_split/4,           % +Pattern, +String, -Split:list, +Options
   44            re_replace/4,         % +Pattern, +With, +String, -NewString
   45            re_replace/5,         % +Pattern, +With, +String, -NewString, +Options
   46            re_compile/3,         % +Pattern, -Regex, +Options
   47            re_flush/0,
   48            re_config/1           % ?Config
   49          ]).   50:- autoload(library(apply), [maplist/2, maplist/3]).   51:- autoload(library(error), [must_be/2, existence_error/2]).   52:- autoload(library(dcg/basics), [eos/2, digit/3, digits/3]).   53:- autoload(library(lists), [append/3]).   54
   55:- use_foreign_library(foreign(pcre4pl)).   56
   57:- meta_predicate
   58    re_foldl(3, +, +, ?, ?, +).

Perl compatible regular expression matching for SWI-Prolog

This module provides an interface to the PCRE2 (Perl Compatible Regular Expression) library. This Prolog interface provides an almost complete wrapper around PCRE2 (the successor to PCRE) with as much backward compatibility to PCRE as possible, because the original implementation was for PCRE (also known as PCRE1).

Regular expressions are created from a pattern and options and represented as a SWI-Prolog blob. This implies they are subject to (atom) garbage collection. Compiled regular expressions can safely be used in multiple threads. Most predicates accept both an explicitly compiled regular expression, a pattern, or a term Pattern/Flags. The semantics of the pattern can be additionally modified by options. In the latter two cases a regular expression blob is created and stored in a cache. The cache can be cleared using re_flush/0.

Most of the predicates in this library take both a regular expression represented as a string with optional flags, e.g., 'aap'/i or a compiled regular expression. If a string (+flags) alternative is used, the library maintains a cache of compiled regular expressions. See also re_flush/0. The library can be asked to rewrite the re_match/2 and re_match/3 goals to use inlined compiled regular expression objects using

:- set_prolog_flag(re_compile, true).

This has some consequences:

See also
- `man pcre2api` or https://www.pcre.org/current/doc/html/pcre2api.html for details of the PCRE2 syntax and options. */
   99:- predicate_options(re_match/3, 3,
  100                     [ start(integer), % Not part of pcre2 API
  101                       % These are in the same order as in pcre4pl.c, to make it easy to compare them
  102                       anchored(boolean),    % Also re_compile/3
  103                       utf_check(boolean),   % Also re_compile/3
  104                       endanchored(boolean), % Also re_compile/3
  105                       bol(boolean),
  106                       eol(boolean),
  107                       empty(boolean),
  108                       empty_atstart(boolean),
  109                       partial_soft(boolean),
  110                       partial_hard(boolean),
  111                       % dfa_restart(boolean),  % TODO: if pcre2_dfa_match() is supported
  112                       % dfa_shortest(boolean), % TODO: if pcre2_dfa_match() is supported
  113                       jit(boolean),
  114                       copy_matched_subject(boolean)
  115                     ]).  116:- predicate_options(re_compile/3, 3,
  117                     [ capture_type(oneof([atom,string,range])), % Not part of pcre2 API
  118                       % These are in the same order as in pcre4pl.c, to make it easy to compare them
  119                       anchored(boolean),    % Also re_match/3
  120                       utf_check(boolean),   % Also re_match/3
  121                       endanchored(boolean), % Also re_match/3
  122                       allow_empty_class(boolean),
  123                       alt_bsux(boolean),
  124                       auto_callout(boolean),
  125                       caseless(boolean),
  126                       dollar_endonly(boolean),
  127                       dotall(boolean),
  128                       dupnames(boolean),
  129                       extended(boolean),
  130                       firstline(boolean),
  131                       match_unset_backref(boolean),
  132                       multiline(boolean),
  133                       never_ucp(boolean),
  134                       never_utf(boolean),
  135                       auto_capture(boolean),
  136                       no_auto_capture(boolean), % backwards compatibility
  137                       auto_possess(boolean),
  138                       dotstar_anchor(boolean),
  139                       start_optimize(boolean),
  140                       ucp(boolean),
  141                       greedy(boolean),
  142                       ungreedy(boolean), % Backwards compatibility
  143                       utf(boolean),
  144                       never_backslash_c(boolean),
  145                       alt_circumflex(boolean),
  146                       alt_verbnames(boolean),
  147                       use_offset_limit(boolean),
  148                       extended_more(boolean),
  149                       literal(boolean),
  150                       match_invalid_utf(boolean),
  151                       jit_complete(boolean),
  152                       jit_partial_soft(boolean),
  153                       jit_partial_hard(boolean),
  154                       jit_invalid_utf(boolean),
  155                       bsr(oneof([anycrlf,unicode])),
  156                       bsr2(oneof([anycrlf,unicode])),
  157                       compat(oneof([])), % Obsolete
  158                       newline(oneof([any,anycrlf,cr,lf,crlf,nul])),
  159                       newline2(oneof([any,anycrlf,cr,lf,crlf,nul]))
  160                     ]).  161:- predicate_options(re_matchsub/4, 4,
  162                     [ pass_to(re_match/3, 3)
  163                     ]).  164:- predicate_options(re_foldl/6, 6,
  165                     [ pass_to(re_match/3, 3)
  166                     ]).  167:- predicate_options(re_split/4, 4,
  168                     [ pass_to(re_match/3, 3)
  169                     ]).  170:- predicate_options(re_replace/5, 5,
  171                     [ pass_to(re_match/3, 3)
  172                     ]).
 re_match(+Regex, +String) is semidet
 re_match(+Regex, +String, +Options) is semidet
Succeeds if String matches Regex. For example:
?- re_match("^needle"/i, "Needle in a haystack").
true.

Defined Options are given below. For details, see the PCRE documentation. If an option is repeated, the first value is used and subsequent values are ignored. Unrecognized options are ignored. Unless otherwise specified, boolean options default to false.

If Regex is a text pattern (optionally with flags), then any of the Options for re_compile/3 can be used, in addition to the Options listed below. If Regex is the result of re_compile/3, then only the following execution-time Options are recognized and any others are ignored. Some options may not exist on your system, depending on the PCRE2 version and how it was built - these unsupported options are silently ignored.

Arguments:
Regex- is the output of re_compile/3, a pattern or a term Pattern/Flags, where Pattern is an atom or string. The defined flags and their related option for re_compile/3 are below.
  • x: extended(true)
  • i: caseless(true)
  • m: multiline(true)
  • s: dotall(true)
  • a: capture_type(atom)
  • r: capture_type(range)
  • t: capture_type(term)

If Regex is the output of re_compile/3, any compile-time options in Options or Flags are ignored and only match-time options are used.

The options that are derived from flags take precedence over the options in the Options list. In the case of conflicting flags, the first one is used (e.g., ra results in capture_type(range)).

  249re_match(Regex, String) :-
  250    re_match(Regex, String, []).
  251re_match(Regex, String, Options) :-
  252    re_compiled(Regex, Compiled, Options),
  253    re_match_(Compiled, String, Options).
 re_matchsub(+Regex, +String, -Sub:dict) is semidet
 re_matchsub(+Regex, +String, -Sub:dict, +Options) is semidet
Match String against Regex. On success, Sub is a dict containing integer keys for the numbered capture group and atom keys for the named capture groups. The entire match string has the key 0. The associated value is determined by the capture_type(Type) option passed to re_compile/3, or by flags if Regex is of the form Pattern/Flags; and may be specified at the level of individual captures using a naming convention for the caption name. See re_compile/3 for details.

The example below exploits the typed groups to parse a date specification:

?- re_matchsub("(?<date> (?<year_I>(?:\\d\\d)?\\d\\d) -
                (?<month_I>\\d\\d) - (?<day_I>\\d\\d) )"/x,
               "2017-04-20", Sub, []).
Sub = re_match{0:"2017-04-20", date:"2017-04-20",
               day:20, month:4, year:2017}.
Arguments:
Both- compilation and execution options are processed. See re_compile/3 and re_match/3 for the set of options. In addition, some compilation options may passed as /Flags to Regex - see re_match/3 for the list of flags.
Regex- See re_match/2 for a description of this argument.
  286re_matchsub(Regex, String, Subs) :-
  287    re_matchsub(Regex, String, Subs, []).
  288
  289re_matchsub(Regex, String, Subs, Options) :-
  290    re_compiled(Regex, Compiled, Options),
  291    re_matchsub_(Compiled, String, Pairs, Options),
  292    dict_pairs(Subs, re_match, Pairs).
 re_foldl(:Goal, +Regex, +String, ?V0, ?V, +Options) is semidet
Fold all matches of Regex on String. Each match is represented by a dict as specified for re_matchsub/4. V0 and V are related using a sequence of invocations of Goal as illustrated below.
call(Goal, Dict1, V0, V1),
call(Goal, Dict2, V1, V2),
...
call(Goal, Dictn, Vn, V).

This predicate is used to implement re_split/4 and re_replace/4. For example, we can count all matches of a Regex on String using this code:

re_match_count(Regex, String, Count) :-
    re_foldl(increment, Regex, String, 0, Count, []).

increment(_Match, V0, V1) :-
    V1 is V0+1.

After which we can query

?- re_match_count("a", "aap", X).
X = 2.

Here is an example Goal for extracting all the matches with their offsets within the string:

range_match(Dict, StringIndex-[MatchStart-Substring|List], StringIndex-List) :-
    Dict.(StringIndex.index) = MatchStart-MatchLen,
    sub_string(StringIndex.string, MatchStart, MatchLen, _, Substring).

And can be used with this query (note the capture_type(range) option, which is needed by range_match/3, and greedy(false) to invert the meaning of *?):

?- String = "{START} Mary {END} had a {START} little lamb {END}",
   re_foldl(range_match,
            "{START} *?(?<piece>.*) *?{END}",
            String, _{string:String,index:piece}-Matches, _-[],
            [capture_type(range),greedy(false)]).
Matches = [8-"Mary", 33-"little lamb"].
  345re_foldl(Goal, Regex, String, V0, V, Options) :-
  346    re_compiled(Regex, Compiled, Options),
  347    re_foldl_(Compiled, String, Goal, V0, V, Options).
  348
  349:- public re_call_folder/4. % prevent code obfusication name mangling
  350:- meta_predicate re_call_folder(2, +, ?, ?).  351
  352%   re_call_folder(:Goal, +Pairs, ?V0, ?V1).
  353%   Used by re_foldl_/6 to call Goal with a dict.
  354%     DO NOT use "%!" comment - that would add it to the docs
  355re_call_folder(Goal, Pairs, V0, V1) :-
  356    dict_pairs(Dict, re_match, Pairs),
  357    call(Goal, Dict, V0, V1).
 re_split(+Pattern, +String, -Splits:list) is det
 re_split(+Pattern, +String, -Splits:list, +Options) is det
Split String using the regular expression Pattern. Splits is a list of strings holding alternating matches of Pattern and skipped parts of the String, starting with a skipped part. The Splits lists ends with a string of the content of String after the last match. If Pattern does not appear in String, Splits is a list holding a copy of String. This implies the number of elements in Splits is always odd. For example:
?- re_split("a+", "abaac", Splits, []).
Splits = ["","a","b","aa","c"].
?- re_split(":\\s*"/n, "Age: 33", Splits, []).
Splits = ['Age', ': ', 33].
Arguments:
Pattern- is the pattern text, optionally follows by /Flags. Similar to re_matchsub/4, the final output type can be controlled by a flag a (atom), s (string, default) or n (number if possible, atom otherwise).
  383re_split(Pattern, String, Splits) :-
  384    re_split(Pattern, String, Splits, []).
  385re_split(Pattern, String, Splits, Options) :-
  386    split_range_regex(Pattern, Compiled, Type, Options),
  387    State = state(String, 0, Type),
  388    re_foldl(split(State), Compiled, String, Splits, [Last], Options),
  389    arg(2, State, LastSkipStart),
  390    typed_sub(Type, String, LastSkipStart, _, 0, Last).
  391
  392split_range_regex(Pattern/Flags, Compiled, Type, Options) =>
  393    split_range_regex(Pattern, Flags, Compiled, Type, Options).
  394split_range_regex(Pattern, Compiled, Type, Options) =>
  395    split_range_regex(Pattern, '', Compiled, Type, Options).
  396
  397split_range_regex(Pattern, Flags, Compiled, Type, Options) =>
  398    regex_capture_type_flag_chars(Flags, Chars, Options),
  399    split_flags(Chars, Chars1, Type),
  400    atom_chars(RFlags, [r|Chars1]),
  401    re_flags_options(RFlags, ROptions),
  402    append(ROptions, Options, Options2),
  403    re_compiled(Pattern/RFlags, Compiled, Options2).
  404
  405split_flags([], [], Type) :-
  406    default(Type, string).
  407split_flags([H|T0], T, Type) :-
  408    split_type(H, Type),
  409    !,
  410    split_flags(T0, T, Type).
  411split_flags([H|T0], [H|T], Type) :-
  412    split_flags(T0, T, Type).
  413
  414split_type(a, atom).
  415split_type(s, string).
  416split_type(n, name).
  417
  418split(State, Dict, [Skipped,Sep|T], T) :-
  419    matched(State, Dict.0, Sep),
  420    skipped(State, Dict.0, Skipped).
  421
  422matched(state(String, _, Type), Start-Len, Matched) :-
  423    typed_sub(Type, String, Start, Len, _, Matched).
  424
  425skipped(State, Start-Len, Skipped) :-
  426    State = state(String, Here, Type),
  427    SkipLen is Start-Here,
  428    typed_sub(Type, String, Here, SkipLen, _, Skipped),
  429    NextSkipStart is Start+Len,
  430    nb_setarg(2, State, NextSkipStart).
  431
  432typed_sub(string, Haystack, B, L, A, String) :-
  433    sub_string(Haystack, B, L, A, String).
  434typed_sub(atom, Haystack, B, L, A, String) :-
  435    sub_atom(Haystack, B, L, A, String).
  436typed_sub(name, Haystack, B, L, A, Value) :-
  437    sub_string(Haystack, B, L, A, String),
  438    (   number_string(Number, String)
  439    ->  Value = Number
  440    ;   atom_string(Value, String)
  441    ).
 re_replace(+Pattern, +With, +String, -NewString) is det
 re_replace(+Pattern, +With, +String, -NewString, +Options) is det
Replace matches of the regular expression Pattern in String with With (possibly containing references to captured substrings).

Throws an error if With uses a name that doesn't exist in the Pattern.

Arguments:
Pattern- is the pattern text, optionally followed by /Flags. Flags may include g, replacing all occurences of Pattern. In addition, similar to re_matchsub/4, the final output type can be controlled by a flag a (atom) or s (string, default). The output type can also be specified by the capture_type option. Capture type suffixes can modify behavior; for example, the following will change an ISO 8601 format date (YYYY-MM-DD) to American style (m/d/y), and also remove leading zeros by using the _I suffix:
re_replace("(?<date> (?<year_I>(?:\\d\\d)?\\d\\d) -
            (?<month_I>\\d\\d) - (?<day_I>\\d\\d) )"/x,
           "$month-$day-$year",
           ISODate, AmericanDate)`
With- is the replacement text. It may reference captured substrings using \N or $Name. Both N and Name may be written as {N} and {Name} to avoid ambiguities. If a substring is named, it cannot be referenced by its number. The single chracters $ and \ can be escaped by doubling (e.g., re_replace(".","$$","abc",Replaced) results in Replaced="$bc"). (Because \ is an escape character inside strings, you need to write "\\\\" to get a single backslash.)
Options- See re_match/3 for the set of options.

The options that are derived from flags take precedence over the options in the Options list. In the case of conflicting flags, the first one is used (e.g., as results in capture_type(string)). If a capture_type is meaningless (range or term), it is ignored.

  485re_replace(Pattern, With, String, NewString) :-
  486    re_replace(Pattern, With, String, NewString, []).
  487
  488re_replace(Pattern, With, String, NewString, Options) :-
  489    replace_range_regex(Pattern, Compiled, All, Type, Options),
  490    compile_replacement(With, RCompiled),
  491    State = state(String, 0, Type),
  492    (   All == all
  493    ->  re_foldl(replace(State, RCompiled), Compiled, String, Parts, [Last], [])
  494    ;   (   re_matchsub(Compiled, String, Match, [])
  495        ->  replace(State, RCompiled, Match, Parts, [Last])
  496        ;   Repl = false
  497        )
  498    ),
  499    (   Repl == false
  500    ->  parts_to_output(Type, [String], NewString)
  501    ;   arg(2, State, LastSkipStart),
  502        sub_string(String, LastSkipStart, _, 0, Last),
  503        parts_to_output(Type, Parts, NewString)
  504    ).
  505
  506regex_capture_type_flag_chars(Flags, Chars, Options) :-
  507    atom_chars(Flags, Chars0),
  508    % For replace or split, the capture_type must be range, so if a
  509    % different result is desired, it is specified in the flags. The
  510    % following code converts an Options capture_type to a flag
  511    % character and appends it to the Flags.
  512    (   memberchk(capture_type(T), Options),
  513        type_flag(TFlag, T)
  514    ->  % No need to do delete(Options,capture_type(_),Options2)
  515        % because Flags take precedence and first occurence in Options
  516        % takes precedence.
  517        append(Chars0, [TFlag], Chars)
  518    ;   Chars = Chars0
  519    ).
 replace_range_regex(+Pattern, -Compiled, -All, -Type, +Options) is det
  522replace_range_regex(Pattern/Flags, Compiled, All, Type, Options) =>
  523    replace_range_regex(Pattern, Flags, Compiled, All, Type, Options).
  524replace_range_regex(Pattern, Compiled, All, Type, Options) =>
  525    replace_range_regex(Pattern, '', Compiled, All, Type, Options).
  526
  527replace_range_regex(Pattern, Flags, Compiled, All, Type, Options) =>
  528    regex_capture_type_flag_chars(Flags, Chars, Options),
  529    replace_flags(Chars, Chars1, All, Type),
  530    atom_chars(RFlags, [r|Chars1]),
  531    re_flags_options(RFlags, ROptions),
  532    append(ROptions, Options, Options2),
  533    re_compiled(Pattern, Compiled, Options2).
  534
  535replace_flags([], [], All, Type) :-
  536    default(All, first),
  537    default(Type, string).
  538replace_flags([H|T0], T, All, Type) :-
  539    (   all_flag(H, All)
  540    ->  true
  541    ;   type_flag(H, Type)
  542    ),
  543    !,
  544    replace_flags(T0, T, All, Type).
  545replace_flags([H|T0], [H|T], All, Type) :-
  546    replace_flags(T0, T, All, Type).
  547
  548all_flag(g, all).
  549
  550type_flag(a, atom).
  551type_flag(s, string).
 default(?Val, +Default) is det
If Val isn't instantiated, instantiate it to Default. If Val is already instantiated, succeed. Equivalent to: default( Val, Default), var(Val) => Val = Default. default(_Val, _Default) => true.
  559default(Val, Val) :- !.
  560default(_, _).
  561
  562replace(State, With, Dict, [Skipped|Parts], T) :-
  563    State = state(String, _, _Type),
  564    copy_term(With, r(PartsR, Skel)),
  565    maplist(dict_pair_lookup(Dict), Skel),
  566    range_strings(PartsR, String, Parts, T),
  567    skipped(State, Dict.0, Skipped).
  568
  569% dict_pair_lookup(d{a:1}, a-K) results in K=1.
  570dict_pair_lookup(Dict, Key-Dict.Key).
  571
  572range_strings([], _, T, T).
  573range_strings([Start-Len|T0], String, [S|T1], T) :-
  574    !,
  575    sub_string(String, Start, Len, _, S),
  576    range_strings(T0, String, T1, T).
  577range_strings([S|T0], String, [S|T1], T) :-
  578    range_strings(T0, String, T1, T).
  579
  580parts_to_output(string, Parts, String) :-
  581    atomics_to_string(Parts, String).
  582parts_to_output(atom, Parts, String) :-
  583    atomic_list_concat(Parts, String).
 compile_replacement(+With, -Compiled)
Compile the replacement specification into a specification that can be processed quickly. The compiled expressions are cached and may be reclaimed using re_flush/0 (which also removes compiled Regex from re_compile/3).

This "compilation" has nothing to do with PCRE pattern compilation; it's used by re_replace/5 to process the With argument.

  595:- table compile_replacement/2 as shared.  596
  597compile_replacement(With, r(Parts, Extract)) :-
  598    string_codes(With, Codes),
  599    phrase(replacement_parts(Parts, Pairs), Codes),
  600    % Pairs is LookupKey-Slot pairs, where a LookupKey might be
  601    % duplicated (Slot is a shared variable within Parts).
  602    Extract = Pairs.
  603
  604replacement_parts(Parts, Extract) -->
  605    string_escape(HCodes),
  606    (   ("\\" ; "$"),
  607        capture_name(Name)
  608    ->  !,
  609        { add_part(HCodes, Parts, T0),
  610          T0 = [Repl|T1],
  611          Extract = [Name-Repl|Extract1]
  612        },
  613        replacement_parts(T1, Extract1)
  614    ;   eos
  615    ->  !,
  616        { add_part(HCodes, Parts, []),
  617          Extract = []
  618        }
  619    ).
  620
  621add_part([], Parts, Parts) :-
  622    !.
  623add_part(Codes, [H|T], T) :-
  624    string_codes(H, Codes).
 string_escape(-Codes)// is nondet
Similar to dcg_basics:string(Codes) but also escapes "$" and "/"
  628string_escape([]) -->
  629    [].
  630string_escape([0'$|T]) -->
  631    "$$", !,
  632    string_escape(T).
  633string_escape([0'\\|T]) -->
  634    "\\\\", !,
  635    string_escape(T).
  636string_escape([H|T]) -->
  637    [H],
  638    string_escape(T).
  639
  640capture_name(Name) -->
  641    "{",
  642    (   digit(D0)
  643    ->  digits(DL),
  644        "}",
  645        { number_codes(Name, [D0|DL]) }
  646    ;   letter(A0),
  647        alnums(AL),
  648        "}",
  649        { atom_codes(Name, [A0|AL]) }
  650    ).
  651capture_name(Name) -->
  652    digit(D0),
  653    !,
  654    digits(DL),
  655    { number_codes(Name, [D0|DL]) }.
  656capture_name(Name) -->
  657    letter(A0),
  658    !,
  659    alnums(AL),
  660    { atom_codes(Name, [A0|AL]) }.
  661
  662letter(L) -->
  663    [L],
  664    { between(0'a,0'z,L)
  665    ; between(0'A,0'Z,L)
  666    ; L == 0'_
  667    }, !.
  668
  669alnums([H|T]) -->
  670    alnum(H),
  671    !,
  672    alnums(T).
  673alnums([]) -->
  674    "".
  675
  676alnum(L) -->
  677    [L],
  678    { between(0'a,0'z,L)
  679    ; between(0'A,0'Z,L)
  680    ; between(0'0,0'9,L)
  681    ; L == 0'_
  682    }, !.
 re_compile(+Pattern, -Regex, +Options) is det
Compiles Pattern to a Regex blob of type regex (see blob/2). Defined Options are given below. Please consult the PCRE2 API documentation for details. If an option is repeated, the first value is used and subsequent values are ignored. Unrecognized options are ignored. Unless otherwise specified, boolean options default to false. Some options may not exist on your system, depending on the PCRE2 version and how it was built - these unsupported options are silently ignored.

The various matching predicates can take either a Regex blob or a string pattern; if they are given a string pattern, they call re_compile/3 and cache the result; so, there is little reason to use re_compile/3 directly.

In addition to the options above that directly map to PCRE flags the following options are processed:

The capture_type specifies the default for this pattern. The interface supports a different type for each named group using the syntax `(?<name_T>...)`, where T is one of S (string), A (atom), I (integer), F (float), N (number), T (term) and R (range). In the current implementation I, F and N are synonyms for T. Future versions may act different if the parsed value is not of the requested numeric type.

Note that re_compile/3 does not support the Pattern/Flags form that is supported by re_match/3, re_replace/4, etc.; the Pattern must be text and all compile options specified in Options.

 re_compiled(+Spec, --Regex, +Options) is det
Create a compiled regex from a specification. Cached compiled regular expressions can be reclaimed using re_flush/0 (which also removes "compiled" With arguments from re_replace/4 and re_replace/5).
  823:- table re_compiled_/4 as shared.  824
  825re_compiled(RegexIn, Regex, Options) :-
  826    (   blob(RegexIn, regex)
  827    ->  Regex = RegexIn
  828    ;   RegexIn = Text/Flags
  829    ->  re_compiled_(Text, Flags, Regex, Options)
  830    ;   re_compiled_(RegexIn, '', Regex, Options)
  831    ).
  832
  833re_compiled_(Text, Flags, Regex, Options) =>
  834    must_be(text, Text),
  835    must_be(atom, Flags),
  836    re_flags_options(Flags, Options0),
  837    append(Options0, Options, Options2),
  838    re_compile(Text, Regex, Options2).
  839
  840re_flags_options(Flags, Options) :-
  841    atom_chars(Flags, Chars),
  842    maplist(re_flag_option, Chars, Options).
  843
  844re_flag_option(Flag, Option) :-
  845    re_flag_option_(Flag, Option),
  846    !.
  847re_flag_option(Flag, _) :-
  848    existence_error(re_flag, Flag).
  849
  850re_flag_option_(i, caseless(true)).
  851re_flag_option_(m, multiline(true)).
  852re_flag_option_(x, extended(true)).
  853re_flag_option_(s, dotall(true)).
  854re_flag_option_(a, capture_type(atom)).
  855re_flag_option_(r, capture_type(range)).
  856re_flag_option_(t, capture_type(term)).
 re_flush
Clean pattern and replacement caches.
To be done
- Flush automatically if the cache becomes too large.
  864re_flush :-
  865    abolish_module_tables(pcre).
 re_config(?Term)
Extract configuration information from the pcre library. Term is of the form Name(Value). Name is derived from the PCRE_CONFIG_* constant after removing PCRE_CONFIG_ and mapping the name to lower case, e.g. utf8, unicode_properties, etc. Value is a Prolog boolean, integer, or atom. For boolean (1 or 0) values, true or false is returned.

re_config/1 will backtrack through all the possible configuration values if its argument is a variable. If an unknown option is specified, re_config/1 fails.

Non-compatible changes between PCRE1 and PCRE2 because numeric values changed: bsr and newline have been replaced by bsr2 and newline2:

Term values are as follows. Some values might not exist, depending on the version of PCRE2 and the options it was built with.

  939%   @see `man pcre2api` for details
  940
  941re_config(Term), var(Term) =>
  942    re_config_choice(Term),
  943    % This code depends on re_config_/1 failing if it's given an invalid
  944    % Term (e.g., re_config_(jittarget(_)) fails if jit(false)). If
  945    % re_config_/1 is changed to throw an error, then the following call
  946    % needs to be inside catch/3.
  947    re_config_(Term).
  948re_config(Term) =>
  949    re_config_(Term).
  950
  951		 /*******************************
  952		 *            COMPILE		*
  953		 *******************************/
  954
  955:- create_prolog_flag(re_compile, false, [keep(true)]).  956
  957expand_regex(Regex, String, Options, Goal) :-
  958    \+ blob(Regex, regex),
  959    re_compiled(Regex, Compiled, Options),
  960    Goal = (pcre:re_match_(Compiled, String, Options)).
  961
  962
  963:- multifile
  964    user:goal_expansion/2.  965
  966user:goal_expansion(re_match(Regex, String), Compiled) :-
  967    ground(Regex),
  968    current_prolog_flag(re_compile, true),
  969    expand_regex(Regex, String, [], Compiled).
  970user:goal_expansion(re_match(Regex, String, Options), Compiled) :-
  971    ground(Regex),
  972    ground(Options),
  973    current_prolog_flag(re_compile, true),
  974    expand_regex(Regex, String, Options, Compiled)