1:- module(
    2  xml_ext,
    3  [
    4    call_on_xml/3,      % +In, +Names, :Goal_1
    5    'Char'//1,          % +Version
    6    'Char'//2,          % +Version, ?Code
    7    load_xml/2,         % +Source, -Dom
    8    xml_encoding/2,     % +In, -Encoding
    9    xml_file_encoding/2 % +File, -Encoding
   10  ]
   11).

Extended support for XML

Extends the support for working with XML provided by the SWI-Prolog standard library.

*/

   20:- use_module(library(apply)).   21:- use_module(library(pure_input)).   22:- use_module(library(sgml)).   23:- use_module(library(yall)).   24
   25:- use_module(library(atom_ext)).   26:- use_module(library(dcg)).   27:- use_module(library(file_ext)).   28:- use_module(library(stream_ext)).   29
   30:- meta_predicate
   31    call_on_xml(+, +, 1).
 call_on_xml(+In:istream, +Names:list(atom), :Goal_1) is det
Call Goal_1 on an XML stream, where the argument supplied to Goal_1 is a subtree that starts with an element called Name.
   42call_on_xml(In, Names, Goal_1) :-
   43  b_setval(xml_stream_record_names, Names),
   44  b_setval(xml_stream_goal, Goal_1),
   45  setup_call_cleanup(
   46    new_sgml_parser(Parser, []),
   47    (
   48      maplist(set_sgml_parser(Parser), [dialect(xml),space(remove)]),
   49      sgml_parse(Parser, [call(begin,on_begin_),source(In)])
   50    ),
   51    free_sgml_parser(Parser)
   52  ).
   53
   54on_begin_(Name, Attr, Parser) :-
   55  b_getval(xml_stream_goal, Goal_1),
   56  b_getval(xml_stream_record_names, Names),
   57  memberchk(Name, Names), !,
   58  sgml_parse(Parser, [document(Dom),parse(content)]),
   59  (   call(Goal_1, [element(Name,Attr,Dom)])
   60  ->  true
   61  ;   print_message(warning, xml_error(element(Name,Attr,Dom)))
   62  ).
 Char(+Version:compound)//
 Char(+Version:compound, ?Code:code)//
An XML Character is an atomic unit of text specified by ISO/IEC 10646.

XML 1.0

Char ::= #x9                // Horizontal tab
       | #xA                // Line feed
       | #xD                // Carriage return
       | [#x20-#xD7FF]      // Space, punctuation, numbers, letters
       | [#xE000-#xFFFD]
       | [#x10000-#x10FFFF]

Avoid comapatibility characters [Unicode, section 2.3]. Avoid the following characters (control characters, permanently undefined Unicode characters):

[#x7F-#x84] // Delete, ...
[#x86-#x9F]
[#xFDD0-#xFDEF],
[#x1FFFE-#x1FFFF]
[#x2FFFE-#x2FFFF]
[#x3FFFE-#x3FFFF]
[#x4FFFE-#x4FFFF]
[#x5FFFE-#x5FFFF]
[#x6FFFE-#x6FFFF]
[#x7FFFE-#x7FFFF]
[#x8FFFE-#x8FFFF]
[#x9FFFE-#x9FFFF]
[#xAFFFE-#xAFFFF]
[#xBFFFE-#xBFFFF]
[#xCFFFE-#xCFFFF]
[#xDFFFE-#xDFFFF]
[#xEFFFE-#xEFFFF]
[#xFFFFE-#xFFFFF]
[#x10FFFE-#x10FFFF]

XML 1.1

Char ::= [#x1-#xD7FF]
       | [#xE000-#xFFFD]
       | [#x10000-#x10FFFF]
/* any Unicode character, excluding the surrogate blocks,
   FFFE, and FFFF. */
Arguments:
Version- is either `version(1,0)' for XML 1.0 or `version(1,1)' for XML 1.1.
  122'Char'(Version) -->
  123  'Char'(Version, _).
  124
  125
  126'Char'(version(1,0), 0x9) --> [0x9].
  127'Char'(version(1,0), 0xA) --> [0xA].
  128'Char'(version(1,0), 0xD) --> [0xD].
  129'Char'(version(1,0), Code) --> dcg_between(0x20, 0xD7FF, Code).
  130'Char'(version(1,0), Code) --> dcg_between(0xE000, 0xFFFD, Code).
  131'Char'(version(1,0), Code) --> dcg_between(0x10000, 0x10FFFF, Code).
  132'Char'(version(1,1), Code) --> dcg_between(0x1, 0xD7FF, Code).
  133'Char'(version(1,1), Code) --> dcg_between(0xE000, 0xFFFD, Code).
  134'Char'(version(1,1), Code) --> dcg_between(0x10000, 0x10FFFF, Code).
 load_xml(+Source, -Dom:list(compound)) is det
  140load_xml(Source, Dom) :-
  141  load_xml(Source, Dom, [space(remove)]).
 xml_encoding(+In:istream, -Encoding:atom) is semidet
  147xml_encoding(In, Encoding) :-
  148  phrase_from_stream(xml_encoding(Encoding0), In),
  149  nonvar(Encoding0),
  150  stream_ext:clean_encoding_(Encoding0, Encoding).
  151
  152xml_encoding(Encoding) -->
  153  'XMLDecl'(_,Encoding,_),
  154  remainder(_).
 xml_file_encoding(+File:atom, -Encoding:atom) is semidet
  160xml_file_encoding(File, Encoding) :-
  161  read_from_file(File, {Encoding}/[In0]>>xml_encoding(In0, Encoding)).
  162
  163
  164
  165
  166
  167% GRAMMAR %
 EncName(-Encoding:atom)// is det
EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*

compat XML 1.0.5 [81] compat XML 1.1.2 [81]

  178'EncName'(Encoding) -->
  179  alpha(H),
  180  'enc_name_char*'(T),
  181  {atom_codes(Encoding, [H|T])}.
  182
  183'enc_name_char*'([H|T]) -->
  184  enc_name_char(H), !,
  185  'enc_name_char*'(T).
  186'enc_name_char*'([]) --> "".
  187
  188enc_name_char(Code) --> alphanum(Code).
  189enc_name_char(0'.) --> ".".
  190enc_name_char(0'_) --> "_".
  191enc_name_char(0'-) --> "-".
 EncodingDecl(-Encoding:atom)//
EncodingDecl ::= S 'encoding' Eq ('"' EncName '"' | "'" EncName "'" )
Compatibility
- XML 1.0.5 [80]
- XML 1.1.2 [80]
  204'EncodingDecl'(Encoding) -->
  205  'S+',
  206  "encoding",
  207  'Eq',
  208  (   "\""
  209  ->  'EncName'(Encoding),
  210      must_see_code(0'")%"
  211  ;   "'"
  212  ->  'EncName'(Encoding),
  213      must_see_code(0'')
  214  ).
 Eq// is det
Eq ::= S? '=' S?
Compatibility
- XML 1.0.5 [25].
- XML 1.1.2 [25].
  227'Eq' -->
  228  'S*',
  229  "=",
  230  'S*'.
 S// is det
Greedy white space.
S ::= ( #x20 | #x9 | #xD | #xA )+ // Any consecutive number of spaces,
                                  // carriage returns, line feeds, and
                                  // horizontal tabs.

The presence of carriage_return// in the above production is maintained purely for backward compatibility with the First Edition. All `#xD` characters literally present in an XML document are either removed or replaced by line_feed// (i.e., `#xA`) characters before any other processing is done.

  250'S' --> [0x20].
  251'S' --> [0x9].
  252'S' --> [0xD].
  253'S' --> [0xA].
  254
  255'S+' -->
  256  'S',
  257  'S*'.
  258
  259'S*' -->
  260  'S', !,
  261  'S*'.
  262'S*' --> "".
 SDDecl(-Standalone:boolean)// is det
Standalone Declaration
SDDecl ::= S 'standalone' Eq
           (("'" ('yes' | 'no') "'") | ('"' ('yes' | 'no') '"'))
Compatibility
- XML 1.0.5 [32].
- XML 1.1.2 [32].
To be done
- [VC: Standalone Document Declaration]
  279'SDDecl'(Standalone) -->
  280  'S+',
  281  "standalone",
  282  'Eq',
  283  (   "'"
  284  ->  yesno(Standalone),
  285      must_see_code(0'')
  286  ;   "\""
  287  ->  yesno(Standalone),
  288      must_see_code(0'")%"
  289  ).
  290
  291yesno(true) -->"yes".
  292yesno(false) --> "no".
 XMLDecl(-Version:compound, -Encoding:atom, -Standalone:boolean)// is det
XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
Compatibility
- XML 1.0.5 [23].
- XML 1.1.2 [23].
  305'XMLDecl'(Version, Encoding, Standalone) -->
  306  "<?xml",
  307  'VersionInfo'(Version),
  308  ('EncodingDecl'(Encoding) -> "" ; ""),
  309  ('SDDecl'(Standalone) -> "" ; ""),
  310  'S*',
  311  "?>".
 VersionInfo(-Version:compound)// is det
VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"')
Compatibility
- XML 1.0.5 [24].
- XML 1.1.2 [24].
  324'VersionInfo'(Version) -->
  325  'S+',
  326  "version",
  327  'Eq',
  328  (   "'"
  329  ->  'VersionNum'(Version),
  330      "'"
  331  ;   "\""
  332  ->  'VersionNum'(Version),
  333      "\""
  334  ).
 VersionNum(-Version:compound)// is det

XML 1.0

VersionNum ::= '1.' [0-9]+

XML 1.1

VersionNum ::= '1.1'
Arguments:
Version- is a compound term of the form `version(Major:nonneg,Minor:nonneg)'.
Compatibility
- XML 1.0.5 [26].
- XML 1.1.2 [26].
  358'VersionNum'(version(1,Minor)) -->
  359  "1.",
  360  integer(Minor).
  361'VersionNum'(version(1,1)) -->
  362  "1.1".
  363
  364
  365
  366
  367
  368% HELPERS %
 must_see_code(+Code:code)// is det
  372must_see_code(Code) -->
  373  must_see_code(Code, 'S*')