1:- module(pac_regex, [parse_regex/2,
    2		      regex_unix_macro/2]).    3
    4% [2015/02/17]
    5
    6regex_unix_macro($, @( $ )).
    7regex_unix_macro(^, []).
    8
    9% ?- parse_regex("a", X).
   10% ?- parse_regex("a|b", X).
   11% ?- parse_regex("(abc)*", X).
   12% ?- parse_regex("([% \t]|(%@*)", X).
   13% ?- parse_regex("([% \t]|(%@*)|(\\?-))*", X).
   14% ?- parse_regex("[^\n]*$", X).
   15
   16			/*********************
   17			*     parse regex    *
   18			*********************/
   19% %
   20% dcg_d((.), [A|B], B, [A]).
   21% dcg_d([A], [A|B], B, [A]).
   22% dcg_d({G},  A,    A, {G}).
   23% dcg_d(X, A, B, X0):-  pacx:complete_args(X, [A, B], X0).
   24
   25% ?- parse_regex("ab", R).
   26% ?- parse_regex("a\\*",R).
   27%@ R = 'C'([97])+'C'([42]).
   28% ?- parse_regex("a\\\*",R).  % <== intentional Syntex error
   29% ?- parse_regex("a\\\\*",R).
   30% ?- parse_regex("a\\\\\\*",R).
   31% ?- parse_regex("[abc\\]]*",R).
   32% ?- parse_regex("[abc\\\\]]*",R).
   33% ?- parse_regex("a", R).
   34% ?- parse_regex("(a)", R).
   35% ?- parse_regex("(abc)", R).
   36% ?- parse_regex("[abc]", R).
   37% ?- parse_regex("[abc]*",R).
   38% ?- parse_regex("[^abc]", R).
   39% ?- parse_regex("(.*)", R).
   40% ?- parse_regex("(a*)", R).
   41% ?- parse_regex("a*b", R).
   42% ?- parse_regex(".", R).
   43% ?- parse_regex("[^a-zA-Z]",R).
   44% ?- parse_regex("a|b|c", R).
   45% ?- parse_regex("(a|b|c)**", R).
   46% ?- parse_regex("(\\(*[a]|1)",R).
   47% ?- parse_regex("abc",R).
   48% ?- parse_regex("[a]",R).
   49% ?- parse_regex("[ab]",R).
   50% ?- parse_regex("[a-b]",R).
   51% ?- parse_regex(".",R).
   52% ?- parse_regex("\\.",R).
   53% ?- parse_regex("a",R).
   54% ?- parse_regex("[^\n]",R).
   55% ?- parse_regex("$",R).
   58parse_regex(X, Y) :- string_chars(X, X0),
   59	once(parse_regex([], [], [], Y0, X0, [])),
   60	Y0 = [Y].
   61%
   62parse_regex(A, B, A0, B0) --> token(T), !,
   63	{ once(push_pop_stacks(T, A, B, A1, B1)) },
   64	parse_regex(A1, B1, A0, B0).
   65parse_regex(A, B, A0, B0) --> % end of the regex
   66	{ close_block(A, B, A0, B1),
   67	  fold_block_reversely(B1, B0) }.
   68
   69% standard push/pop actions from  operator-precedence grammars
   70push_pop_stacks(')', A, X, B, Y) :-  close_block(A, X, B, Z),	% block close
   71	fold_block_reversely(Z,  Y).
   72push_pop_stacks('(', A, X, ['('|A], ['('|X]).	% block open
   73push_pop_stacks({}(I), A, [T|X],  A, [T^I|X]).	% repeat spec
   74push_pop_stacks(F, A, X, A, Y) :-  unary_opr(F),
   75	apply_unary_opr(F, X, Y).
   76push_pop_stacks(F, A, X, [F|B], Y) :- binary_opr(F),
   77	once(sweep_higher_opr(F, A, X, B, Y)).
   78push_pop_stacks(T, A, X, A, [T|X]).
   79
   80%
   81token('C'([C])) --> [(\)], [C0], {char_code(C0, C)}.  % escape charcter
   82token(X) --> [ A ], { regex_unix_macro(A, X) }.
   83% token(T) --> [T], {memberchk(T, ['(',')', '*', '+', '!', '|', '.', '?'])}.
   84token(T) --> [T], {memberchk(T, ['(',')', '*', '+', '|', '.', '?'])}.
   85token(out(D))	--> ['[', ^],  char_class(D0, []), {chars_interval(D0, D)}.
   86token(dot(D))	--> ['['], char_class(D0, []), {chars_interval(D0, D)}.
   87token({}(I))	--> ['{'], repeat_spec(I).
   88token('C'([C]))	--> [C0], {char_code(C0, C)}.
   89
   90%
   91sweep_higher_opr(_, [], X, [], X).
   92sweep_higher_opr(_, ['('|X], Y, ['('|X], Y).
   93sweep_higher_opr(F, [G|A], X, B, Y):- higher_priority(G, F),
   94	apply_binary_opr(G, X, Z),
   95	sweep_higher_opr(F, A, Z, B, Y).
   96sweep_higher_opr(F, A, X, B, Y):- push_pop_stacks(F, A, X, B, Y).
   97
   98%
   99close_block([], X, [], X).
  100close_block(['('|A], X, A, X).
  101close_block([F|A], X, B, Y):-
  102	apply_binary_opr(F, X, Z),
  103	close_block(A, Z, B, Y).
  104
  105% ?- fold_block_reversely([a, c, '(', b], X).
  106fold_block_reversely([X|Y], [Z|U]):- fold_block_reversely(X, Y, Z, U).
  107fold_block_reversely([], []).
  108
  109%
  110fold_block_reversely(X, [], X, []).
  111fold_block_reversely(X, ['('|Y], X, Y).
  112fold_block_reversely(X, [Y|Z], U, V):- fold_block_reversely(Y+X, Z, U, V).
  113
  114%
  115unary_opr(*).
  116unary_opr(?).
  117% unary_opr(!).
  118unary_opr(+).
  119
  120%
  121binary_opr(&).
  122binary_opr('|').
  123
  124% only for binary operators
  125higher_priority(&, &).
  126higher_priority(&, '|').
  127higher_priority('|', '|').
  128
  129%
  130apply_unary_opr(*, [X|Y], [*(X)|Y]).
  131apply_unary_opr(+, [X|Y], [+(X)|Y]).
  132apply_unary_opr(?, [X|Y], [?(X)|Y]).
  133% apply_unary_opr(!, [X|Y], [!(X)|Y]).
  134
  135%
  136apply_binary_opr('|', [X, Y|Z], ['|'(Y, X)|Z]).
  137apply_binary_opr('&', [X, Y|Z], ['&'(Y, X)|Z]).
  138
  139% ?- repeat_spec(X, ['1', '2', ',', '3', '4', '}'], Y).
  140%@ X = 12-34
  141% ?- repeat_spec(X, ['1', '2', ',','}'], Y).
  142%@ X = >=(12)
  143
  144repeat_spec(X) --> number_chars(J0), [','],   { J0\==[] },
  145	 number_chars(K0), ['}'],
  146	{ 	number_chars(J, J0),
  147		( K0 \== []
  148		->	number_chars(K, K0),
  149			X = J-K
  150		;	X = (>=(J))
  151		)
  152	}.
  153
  154%
  155number_chars([D|Ds]) --> [D], {char_type(D, digit)},
  156	number_chars(Ds).
  157number_chars([]) --> [].
  158
  159%
  160char_class([\(C)|Y], Z)	--> [\], [C], char_class(Y,Z).
  161char_class([A|X], Y)	--> [A, A], char_class(X, Y).  % "idempotent" law
  162char_class(X, X)		--> [']'].
  163char_class([C|Y], Z)	--> [C], char_class(Y,Z).
  164
  165%
  166chars_interval([],[]).
  167chars_interval([X, -, Y|R], [C - D|S]):- drop_escape(X, X0),
  168	drop_escape(Y, Y0),
  169	char_code(X0, C),
  170	char_code(Y0, D),
  171	chars_interval(R, S).
  172chars_interval([X|R], [C|S]):- drop_escape(X, X0),
  173	char_code(X0, C),
  174	chars_interval(R, S).
  175
  176%
  177drop_escape(\(X), X).
  178drop_escape(X, X)