View source with formatted comments or as raw
    1/*  Part of SWI-Prolog
    2
    3    Author:        Jan Wielemaker
    4    E-mail:        J.Wielemaker@vu.nl
    5    WWW:           http://www.swi-prolog.org
    6    Copyright (c)  2009-2023, VU University Amsterdam
    7			      SWI-Prolog Solutions b.v.
    8    All rights reserved.
    9
   10    Redistribution and use in source and binary forms, with or without
   11    modification, are permitted provided that the following conditions
   12    are met:
   13
   14    1. Redistributions of source code must retain the above copyright
   15       notice, this list of conditions and the following disclaimer.
   16
   17    2. Redistributions in binary form must reproduce the above copyright
   18       notice, this list of conditions and the following disclaimer in
   19       the documentation and/or other materials provided with the
   20       distribution.
   21
   22    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   23    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   24    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
   25    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
   26    COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   27    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
   28    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   29    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
   30    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   31    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
   32    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   33    POSSIBILITY OF SUCH DAMAGE.
   34*/
   35
   36:- module(uri,
   37          [ uri_components/2,           % ?URI, ?Components
   38            uri_data/3,                 % ?Field, +Components, ?Data
   39            uri_data/4,                 % +Field, +Components, -Data, -New
   40	    uri_edit/3,			% +Actions,+URI0,-URI
   41
   42            uri_normalized/2,           % +URI, -NormalizedURI
   43            iri_normalized/2,           % +IRI, -NormalizedIRI
   44            uri_normalized_iri/2,       % +URI, -NormalizedIRI
   45            uri_normalized/3,           % +URI, +Base, -NormalizedURI
   46            iri_normalized/3,           % +IRI, +Base, -NormalizedIRI
   47            uri_normalized_iri/3,       % +URI, +Base, -NormalizedIRI
   48            uri_resolve/3,              % +URI, +Base, -AbsURI
   49            uri_is_global/1,            % +URI
   50            uri_query_components/2,     % ?QueryString, ?NameValueList
   51            uri_authority_components/2, % ?Authority, ?Components
   52            uri_authority_data/3,       % ?Field, ?Components, ?Data
   53					% Encoding
   54            uri_encoded/3,              % +Component, ?Value, ?Encoded
   55            uri_file_name/2,            % ?URI, ?Path
   56            uri_iri/2                   % ?URI, ?IRI
   57	  ]).   58:- autoload(library(error), [domain_error/2]).   59:- use_foreign_library(foreign(uri)).   60
   61/** <module> Process URIs
   62
   63This  library  provides   high-performance    C-based   primitives   for
   64manipulating URIs. We decided for a  C-based implementation for the much
   65better performance on raw character  manipulation. Notably, URI handling
   66primitives are used in  time-critical  parts   of  RDF  processing. This
   67implementation is based on RFC-3986:
   68
   69        http://labs.apache.org/webarch/uri/rfc/rfc3986.html
   70
   71The URI processing in this library is  rather liberal. That is, we break
   72URIs according to the rules, but we  do not validate that the components
   73are valid. Also, percent-decoding for IRIs   is  liberal. It first tries
   74UTF-8; then ISO-Latin-1 and finally accepts %-characters verbatim.
   75
   76Earlier experience has shown that strict   enforcement of the URI syntax
   77results in many errors that  are   accepted  by  many other web-document
   78processing tools.
   79*/
   80
   81%!  uri_components(+URI, -Components) is det.
   82%!  uri_components(-URI, +Components) is det.
   83%
   84%   Break a URI  into  its  5   basic  components  according  to the
   85%   RFC-3986 regular expression:
   86%
   87%       ==
   88%       ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
   89%        12            3  4          5       6  7        8 9
   90%       ==
   91%
   92%   @param Components is a   term  uri_components(Scheme, Authority,
   93%   Path, Search, Fragment). If a URI  is *parsed*, i.e., using mode
   94%   (+,-), components that are not   found are left _uninstantiated_
   95%   (variable). See uri_data/3 for accessing this structure.
   96
   97%!  uri_data(?Field, +Components, ?Data) is semidet.
   98%
   99%   Provide access to the uri_components/5 structure. Defined field
  100%   names are: =scheme=, =authority=, =path=, =search= and =fragment=.
  101
  102uri_data(scheme,    uri_components(Scheme, _, _, _, _),    Scheme).
  103uri_data(authority, uri_components(_, Authority, _, _, _), Authority).
  104uri_data(path,      uri_components(_, _, Path, _, _),      Path).
  105uri_data(search,    uri_components(_, _, _, Search, _),    Search).
  106uri_data(fragment,  uri_components(_, _, _, _, Fragment),  Fragment).
  107
  108%!  uri_data(+Field, +Components, +Data, -NewComponents) is semidet.
  109%
  110%   NewComponents is Components with the value of Field replaced by Data.
  111
  112uri_data(scheme,    uri_components(_, Auth, Path, Search, Frag), Scheme,
  113                    uri_components(Scheme, Auth, Path, Search, Frag)).
  114uri_data(authority, uri_components(Scheme, _, Path, Search, Frag), Auth,
  115                    uri_components(Scheme, Auth, Path, Search, Frag)).
  116uri_data(path,      uri_components(Scheme, Auth, _, Search, Frag), Path,
  117                    uri_components(Scheme, Auth, Path, Search, Frag)).
  118uri_data(search,    uri_components(Scheme, Auth, Path, _, Frag), Search,
  119                    uri_components(Scheme, Auth, Path, Search, Frag)).
  120uri_data(fragment,  uri_components(Scheme, Auth, Path, Search, _), Frag,
  121                    uri_components(Scheme, Auth, Path, Search, Frag)).
  122
  123%!  uri_normalized(+URI, -NormalizedURI) is det.
  124%
  125%   NormalizedURI is the normalized form   of  URI. Normalization is
  126%   syntactic and involves the following steps:
  127%
  128%       * 6.2.2.1. Case Normalization
  129%       * 6.2.2.2. Percent-Encoding Normalization
  130%       * 6.2.2.3. Path Segment Normalization
  131
  132%!  iri_normalized(+IRI, -NormalizedIRI) is det.
  133%
  134%   NormalizedIRI is the normalized form   of  IRI. Normalization is
  135%   syntactic and involves the following steps:
  136%
  137%       * 6.2.2.1. Case Normalization
  138%       * 6.2.2.3. Path Segment Normalization
  139%
  140%   @see    This is similar to uri_normalized/2, but does not do
  141%           normalization of %-escapes.
  142
  143%!  uri_normalized_iri(+URI, -NormalizedIRI) is det.
  144%
  145%   As uri_normalized/2, but percent-encoding is translated into IRI
  146%   Unicode characters. The translation  is   liberal:  valid  UTF-8
  147%   sequences  of  %-encoded  bytes  are    mapped  to  the  Unicode
  148%   character. Other %XX-sequences are mapped   to the corresponding
  149%   ISO-Latin-1 character and sole % characters are left untouched.
  150%
  151%   @see uri_iri/2.
  152
  153
  154%!  uri_is_global(+URI) is semidet.
  155%
  156%   True if URI has a scheme. The  semantics   is  the  same as the code
  157%   below, but the implementation is more efficient  as it does not need
  158%   to parse the other components, nor  needs   to  bind the scheme. The
  159%   condition to demand a scheme of more  than one character is added to
  160%   avoid confusion with DOS path names.
  161%
  162%   ==
  163%   uri_is_global(URI) :-
  164%           uri_components(URI, Components),
  165%           uri_data(scheme, Components, Scheme),
  166%           nonvar(Scheme),
  167%           atom_length(Scheme, Len),
  168%           Len > 1.
  169%   ==
  170
  171%!  uri_resolve(+URI, +Base, -GlobalURI) is det.
  172%
  173%   Resolve a possibly local URI relative   to Base. This implements
  174%   http://labs.apache.org/webarch/uri/rfc/rfc3986.html#relative-transform
  175
  176%!  uri_normalized(+URI, +Base, -NormalizedGlobalURI) is det.
  177%
  178%   NormalizedGlobalURI is the normalized global version of URI.
  179%   Behaves as if defined by:
  180%
  181%   ==
  182%   uri_normalized(URI, Base, NormalizedGlobalURI) :-
  183%           uri_resolve(URI, Base, GlobalURI),
  184%           uri_normalized(GlobalURI, NormalizedGlobalURI).
  185%   ==
  186
  187%!  iri_normalized(+IRI, +Base, -NormalizedGlobalIRI) is det.
  188%
  189%   NormalizedGlobalIRI is the normalized  global   version  of IRI.
  190%   This is similar to uri_normalized/3, but   does  not do %-escape
  191%   normalization.
  192
  193%!  uri_normalized_iri(+URI, +Base, -NormalizedGlobalIRI) is det.
  194%
  195%   NormalizedGlobalIRI is the normalized global IRI of URI. Behaves
  196%   as if defined by:
  197%
  198%   ==
  199%   uri_normalized_iri(URI, Base, NormalizedGlobalIRI) :-
  200%           uri_resolve(URI, Base, GlobalURI),
  201%           uri_normalized_iri(GlobalURI, NormalizedGlobalIRI).
  202%   ==
  203
  204%!  uri_query_components(+String, -Query) is det.
  205%!  uri_query_components(-String, +Query) is det.
  206%
  207%   Perform encoding and decoding of a URI query string.  Query is a
  208%   list of fully decoded (Unicode) Name=Value pairs. In mode (-,+),
  209%   query elements of the forms Name(Value)  and Name-Value are also
  210%   accepted to enhance interoperability with   the option and pairs
  211%   libraries.  E.g.
  212%
  213%   ==
  214%   ?- uri_query_components(QS, [a=b, c('d+w'), n-'VU Amsterdam']).
  215%   QS = 'a=b&c=d%2Bw&n=VU%20Amsterdam'.
  216%
  217%   ?- uri_query_components('a=b&c=d%2Bw&n=VU%20Amsterdam', Q).
  218%   Q = [a=b, c='d+w', n='VU Amsterdam'].
  219%   ==
  220
  221
  222%!  uri_authority_components(+Authority, -Components) is det.
  223%!  uri_authority_components(-Authority, +Components) is det.
  224%
  225%   Break-down the  authority component of  a URI.  The fields  of the
  226%   structure Components  can be accessed  using uri_authority_data/3.
  227%   This  predicate deals  with  IPv6 addresses  written as  ``[ip]``,
  228%   returning the  _ip_ as `host`,  without the enclosing  `[]`.  When
  229%   constructing an  authority string and  the host contains  `:`, the
  230%   host is  embraced in  `[]`.  If  `[]` is  not used  correctly, the
  231%   behavior  should be  considered poorly  defined.  If  there is  no
  232%   balancing  `]` or  the  host part  does not  end  with `]`,  these
  233%   characters  are  considered  normal  characters and  part  of  the
  234%   (invalid) host name.
  235
  236
  237%!  uri_authority_data(+Field, ?Components, ?Data) is semidet.
  238%
  239%   Provide access to the uri_authority/4 structure. Defined field
  240%   names are: =user=, =password=, =host= and =port=.
  241
  242uri_authority_data(user,     uri_authority(User, _, _, _),     User).
  243uri_authority_data(password, uri_authority(_, Password, _, _), Password).
  244uri_authority_data(host,     uri_authority(_, _, Host, _),     Host).
  245uri_authority_data(port,     uri_authority(_, _, _, Port),     Port).
  246
  247
  248%!  uri_encoded(+Component, +Value, -Encoded) is det.
  249%!  uri_encoded(+Component, -Value, +Encoded) is det.
  250%
  251%   Encoded   is   the   URI   encoding   for   Value.   When   encoding
  252%   (Value->Encoded), Component specifies the URI   component  where the
  253%   value is used. It is  one   of  =query_value=, =fragment=, =path= or
  254%   =segment=.  Besides  alphanumerical   characters,    the   following
  255%   characters are passed verbatim (the set   is split in logical groups
  256%   according to RFC3986).
  257%
  258%       $ query_value, fragment :
  259%       "-._~" | "!$'()*,;" | "@" | "/?"
  260%       $ path :
  261%       "-._~" | "!$&'()*,;=" | "@" | "/"
  262%       $ segment :
  263%       "-._~" | "!$&'()*,;=" | "@"
  264
  265%!  uri_iri(+URI, -IRI) is det.
  266%!  uri_iri(-URI, +IRI) is det.
  267%
  268%   Convert between a URI, encoded in US-ASCII and an IRI. An IRI is
  269%   a fully expanded  Unicode  string.   Unicode  strings  are first
  270%   encoded into UTF-8, after which %-encoding takes place.
  271%
  272%   @error syntax_error(Culprit) in mode (+,-) if URI is not a
  273%   legally percent-encoded UTF-8 string.
  274
  275
  276%!  uri_file_name(+URI, -FileName) is semidet.
  277%!  uri_file_name(-URI, +FileName) is det.
  278%
  279%   Convert between a URI and a   local  file_name. This protocol is
  280%   covered by RFC 1738. Please note   that file-URIs use _absolute_
  281%   paths. The mode (-, +) translates  a possible relative path into
  282%   an absolute one.
  283
  284uri_file_name(URI, FileName) :-
  285    nonvar(URI),
  286    !,
  287    uri_components(URI, Components),
  288    Components = uri_components(Scheme, Authority, EncPath, _, _),
  289    Scheme == file,
  290    (   Authority = ''              % empty or absent authority
  291    ->  true
  292    ;   Authority = localhost
  293    ),
  294    uri_encoded(path, Path, EncPath),
  295    delete_leading_slash(Path, FileName).
  296uri_file_name(URI, FileName) :-
  297    nonvar(FileName),
  298    !,
  299    absolute_file_name(FileName, AbsPath),
  300    ensure_leading_slash(AbsPath, SlashPath),
  301    uri_encoded(path, SlashPath, EncPath),
  302    % Scheme `file` with an empty authority; the search and fragment
  303    % fields are left unbound.
  304    Components = uri_components(file, '', EncPath, _, _),
  305    uri_components(URI, Components).
  306
  307%!  ensure_leading_slash(+WinPath, -Path).
  308%!  delete_leading_slash(+Path, -WinPath).
  309%
  310%   Deal with the fact that absolute paths   in Windows start with a
  311%   drive letter rather than a  /.  For   URIs  we  need a path that
  312%   starts with a /.
  313
  314ensure_leading_slash(Path, SlashPath) :-
  315    sub_atom(Path, 0, _, _, /), !,      % already starts with a slash
  316    SlashPath = Path.
  317ensure_leading_slash(Path, SlashPath) :-
  318    atom_concat(/, Path, SlashPath).
  319
  320:- if(current_prolog_flag(windows, true)).
  321delete_leading_slash(SlashPath, DrivePath) :-
  322    atom_concat(/, DrivePath, SlashPath),
  323    is_absolute_file_name(DrivePath),
  324    !.
  325:- endif.
  326delete_leading_slash(Path, Path).
  327
  328
  329		 /*******************************
  330		 *          MODIFYING           *
  331		 *******************************/
  332
  333%!  uri_edit(+Actions, +URI0, -URI) is det.
  334%
  335%   Modify a  URI according  to Actions.  Actions  is either  a single
  336%   action or a  (nested) list of actions.   Defined primitive actions
  337%   are:
  338%
  339%     - scheme(+Scheme)
  340%       Set the Scheme of the URI (typically `http`, `https`, etc.)
  341%     - user(+User)
  342%       Add/set the user of the authority component.
  343%     - password(+Password)
  344%       Add/set the password of the authority component.
  345%     - host(+Host)
  346%       Add/set the host (or ip address) of the authority component.
  347%     - port(+Port)
  348%       Add/set the port of the authority component.
  349%     - path(+Path)
  350%       Set/extend the `path` component.  If Path is not absolute it
  351%       is taken relative to the path of URI0.
  352%     - search(+KeyValues)
  353%       Extend the `Key=Value` pairs of the current search (query)
  354%       component.   New values replace existing values.  If KeyValues
  355%       is written as =(KeyValues) the current search component is
  356%       ignored.  KeyValues is a list, whose elements are one of
  357%       `Key=Value`, `Key-Value` or `Key(Value)`.
  358%     - fragment(+Fragment)
  359%       Set the Fragment of the uri.
  360%
  361%   Components can be  _removed_ by using a variable  as value, except
  362%   from `path` which  can be reset using path(/) and  query which can
  363%   be dropped using query(=([])).
  364%
  365%   @arg URI0 is either a valid uri or a variable to start fresh.
  366
  367uri_edit(Actions, URI0, URI) :-
  368    (   nonvar(URI0)
  369    ->  Start = URI0
  370    ;   Start = '/'                 % unbound URI0: start from the root path
  371    ),
  372    uri_components(Start, Before),
  373    edit_components(Actions, Before, After),
  374    uri_components(URI, After).
  375
  376edit_components([], Comp0, Comp) =>                  % no actions left
  377    Comp = Comp0.
  378edit_components([Action|Actions], Comp0, Comp) =>    % apply left to right
  379    edit_components(Action, Comp0, Comp1),
  380    edit_components(Actions, Comp1, Comp).
  381edit_components(scheme(Scheme), Comp0, Comp) =>
  382    uri_data(scheme, Comp0, Scheme, Comp).
  383edit_components(path(Path), Comp0, Comp) =>
  384    uri_data(path, Comp0, OldPath),
  385    % An unbound or empty current path is treated as the root path.
  386    (   nonvar(OldPath),
  387        OldPath \== ''
  388    ->  BasePath = OldPath
  389    ;   BasePath = '/'
  390    ),
  391    uri_normalized(Path, BasePath, NewPath),
  392    uri_data(path, Comp0, NewPath, Comp).
  393edit_components(fragment(Fragment), Comp0, Comp) =>
  394    uri_data(fragment, Comp0, Fragment, Comp).
  395edit_components(Field, Comp0, Comp),
  396  authority_field(Field) =>
  397    uri_data(authority, Comp0, OldAuth),
  398    (   nonvar(OldAuth)
  399    ->  uri_authority_components(OldAuth, AuthComp0)
  400    ;   true                        % no authority: AuthComp0 stays unbound
  401    ),
  402    edit_auth_components(Field, AuthComp0, AuthComp),
  403    uri_authority_components(NewAuth, AuthComp),
  404    uri_data(authority, Comp0, NewAuth, Comp).
  405edit_components(query(Search), Comp0, Comp) =>       % alias for search/1
  406    edit_components(search(Search), Comp0, Comp).
  407edit_components(search(=(Search)), Comp0, Comp) =>   % replace, do not merge
  408    uri_query_components(QueryString, Search),
  409    uri_data(search, Comp0, QueryString, Comp).
  410edit_components(search(Search), Comp0, Comp) =>
  411    uri_data(search, Comp0, OldString),
  412    (   nonvar(OldString)
  413    ->  uri_query_components(OldString, OldSearch)
  414    ;   OldSearch = []
  415    ),
  416    join_search(OldSearch, Search, NewSearch),
  417    uri_query_components(NewString, NewSearch),
  418    uri_data(search, Comp0, NewString, Comp).
  419edit_components(Action, _, _) =>
  420    domain_error(uri_edit, Action).
  421
  422authority_field(user(_User)).
  423authority_field(password(_Password)).
  424authority_field(host(_Host)).
  425authority_field(port(_Port)).
  426
  427edit_auth_components(user(NewUser),
  428                     uri_authority(_, Password, Host, Port),
  429                     uri_authority(NewUser, Password, Host, Port)).
  430edit_auth_components(password(NewPassword),
  431                     uri_authority(User, _, Host, Port),
  432                     uri_authority(User, NewPassword, Host, Port)).
  433edit_auth_components(host(NewHost),
  434                     uri_authority(User, Password, _, Port),
  435                     uri_authority(User, Password, NewHost, Port)).
  436edit_auth_components(port(NewPort),
  437                     uri_authority(User, Password, Host, _),
  438                     uri_authority(User, Password, Host, NewPort)).
  439
  440join_search([], New, New).
  441join_search([Key=_|Old], New, Search) :-
  442    % Drop the old pair when New mentions Key in any accepted form:
  443    % Key=_, Key(_) or Key-_.
  444    (   memberchk(Key=_, New)
  445    ;   functor(Term, Key, 1),
  446        memberchk(Term, New)
  447    ;   memberchk(Key-_, New)
  448    ),
  449    !,
  450    join_search(Old, New, Search).
  451join_search([Pair|Old], New, [Pair|Search]) :-
  452    join_search(Old, New, Search).
  453
  454
  455                 /*******************************
  456                 *            SANDBOX           *
  457                 *******************************/
  458
  459:- multifile sandbox:safe_primitive/1.
  460% Declare the URI primitives below as safe for sandboxed execution.
  461sandbox:safe_primitive(uri:uri_components(_,_)).
  462sandbox:safe_primitive(uri:uri_normalized(_,_)).
  463sandbox:safe_primitive(uri:iri_normalized(_,_)).
  464sandbox:safe_primitive(uri:uri_normalized_iri(_,_)).
  465sandbox:safe_primitive(uri:uri_normalized(_,_,_)).
  466sandbox:safe_primitive(uri:iri_normalized(_,_,_)).
  467sandbox:safe_primitive(uri:uri_normalized_iri(_,_,_)).
  468sandbox:safe_primitive(uri:uri_resolve(_,_,_)).
  469sandbox:safe_primitive(uri:uri_is_global(_)).
  470sandbox:safe_primitive(uri:uri_query_components(_,_)).
  471sandbox:safe_primitive(uri:uri_authority_components(_,_)).
  472sandbox:safe_primitive(uri:uri_encoded(_,_,_)).
  473sandbox:safe_primitive(uri:uri_iri(_,_)).