1:- module(regex_parser, [re//2]).    2:- use_module(library(dcg/basics), [integer//1, string//1]).    3:- use_module(library(regex/state), [adjust_case/3]).    4
    5:- set_prolog_flag(double_quotes, string).    6
    7% DCG parser for regular expressions
    8re(Opt, Z) -->
    9    basic_re(Opt,W),
   10    re_tail(Opt,W,Z).
   11
   12
   13re_tail(Opt, W, Z) -->
   14    "|",
   15    basic_re(Opt,X),
   16    re_tail(Opt,union(W,X), Z).
   17re_tail(_Opt, W, W) -->
   18    { true }.
   19
   20
   21basic_re(Opt, Z) -->
   22    simple_re(Opt,W),
   23    basic_re_tail(Opt,W,Z).
   24
   25basic_re_tail(Opt, W, Z) -->
   26    simple_re(Opt,X),
   27    basic_re_tail(Opt,conc(W,X), Z).
   28basic_re_tail(_Opt, W, W) -->
   29    { true }.
   30
   31
   32simple_re(Opt, Z) -->
   33    elemental_re(Opt,W),
   34    simple_re_tail(Opt,W,Z).
   35
   36simple_re_tail(_Opt, W, count(W,0,999_999_999)) -->
   37    "*".
   38simple_re_tail(_Opt, W, count(W,1,999_999_999)) -->
   39    "+".
   40simple_re_tail(_Opt, W, count(W,0,1)) -->
   41    "?".
   42simple_re_tail(_Opt, W, count(W,N,N)) -->
   43    % {n}
   44    "{",
   45    integer(N),
   46    { N >= 0 },
   47    "}".
   48simple_re_tail(_Opt, W, count(W,N,999_999_999)) -->
   49    % {n,}
   50    "{",
   51    integer(N),
   52    { N >= 0 },
   53    ",",
   54    "}".
   55simple_re_tail(_Opt, W, count(W,N,M)) -->
   56    % {n,m}
   57    "{",
   58    integer(N),
   59    { N >= 0 },
   60    ",",
   61    integer(M),
   62    { M >= N },
   63    "}".
   64simple_re_tail(_Opt, W, W) -->
   65    { true }.
   66
   67
   68elemental_re(_Opt, any) -->
   69    ".".
   70%elemental_re(_Opt, caret) -->
   71%    "^".
   72elemental_re(Opt, group(X)) -->
   73    "(",
   74    re(Opt, X),
   75    ")".
   76elemental_re(Opt, named_group(Name, X)) -->
   77    "(?<",
   78    string(NameCodes),
   79    { atom_codes(Name, NameCodes) },
   80    ">",
   81    re(Opt, X),
   82    ")".
   83elemental_re(_Opt, eos) -->
   84    "$".
   85elemental_re(State, char(C)) -->
   86    [C0],
   87    { \+ re_metachar(C0) },
   88    { adjust_case(State, C0, C) }.
   89elemental_re(Opt, RE) -->
   90    "\\",
   91    [C],
   92    { perl_character_class(C, Opt, RE) }.
   93elemental_re(_Opt, char(C)) -->
   94    "\\",
   95    [C],
   96    { re_metachar(C) }.
   97elemental_re(Opt, neg_set(X)) -->
   98    "[^",
   99    !,  % don't backtrack into pos_set/1 clause below
  100    set_items(Opt,X),
  101    "]".
  102elemental_re(Opt, pos_set([char(0'-)|X])) -->
  103    "[-",
  104    !,  % don't backtrack into pos_set/1 clause below
  105    set_items(Opt,X),
  106    "]".
  107elemental_re(Opt, pos_set(X)) -->
  108    "[",
  109    set_items(Opt,X),
  110    "]".
  111elemental_re(Opt, pos_set([char(0'-)|X])) -->
  112    "[",
  113    set_items(Opt,X),
  114    "-]".
  115
  116
  117% true if argument is a code for a regular expression meta character
  118re_metachar(0'^).
  119re_metachar(0'\\).
  120re_metachar(0'|).
  121re_metachar(0'*).
  122re_metachar(0'+).
  123re_metachar(0'.).
  124re_metachar(0'?).
  125re_metachar(0'[).
  126re_metachar(0'$).
  127re_metachar(0'().
  128re_metachar(0')).
  129
  130
  131% define Perl character classes as character sets
  132perl_character_class(0'd, Opt, pos_set(X)) :-
  133    string_codes("0-9", Codes),
  134    set_items(Opt, X,Codes,[]).
  135perl_character_class(0'w, Opt, pos_set(X)) :-
  136    string_codes("0-9A-Za-z_", Codes),
  137    set_items(Opt, X,Codes,[]).
  138perl_character_class(0's, _Opt, pos_set([ char(0'\t)  % tab
  139                                  , char(0'\n)  % newline
  140                                  , char(0'\f)  % form feed
  141                                  , char(0'\r)  % carriage return
  142                                  , char(0' )   % space
  143                                  ])).
  144perl_character_class(Upper, Opt, neg_set(Set)) :-
  145    code_type(Lower, lower(Upper)),
  146    perl_character_class(Lower, Opt, pos_set(Set)).
  147
  148
  149set_items(Opt, [Item1|MoreItems]) -->
  150    set_item(Opt, Item1),
  151    set_items(Opt, MoreItems).
  152set_items(Opt, [Item1]) -->
  153    set_item(Opt, Item1).
  154
  155set_item(State, char(C)) -->
  156    [C0],
  157    { \+ set_metachar(C0) },
  158    { adjust_case(State,C0,C) }.
  159set_item(_Opt, char(C)) -->
  160    "\\",
  161    [C],
  162    { set_metachar(C) }.
  163set_item(Opt, range(A,B)) -->
  164    set_item(Opt, char(A)),
  165    "-",
  166    set_item(Opt, char(B)).
  167
  168
  169set_metachar(0'\\).
  170set_metachar(0']).
  171set_metachar(0'-)