View source with raw comments or as raw
    1/*  Part of SWI-Prolog
    2
    3    Author:        Jan Wielemaker
    4    E-mail:        J.Wielemaker@vu.nl
    5    WWW:           http://www.swi-prolog.org
    6    Copyright (c)  2009-2025, VU University Amsterdam
    7			      SWI-Prolog Solutions b.v.
    8    All rights reserved.
    9
   10    Redistribution and use in source and binary forms, with or without
   11    modification, are permitted provided that the following conditions
   12    are met:
   13
   14    1. Redistributions of source code must retain the above copyright
   15       notice, this list of conditions and the following disclaimer.
   16
   17    2. Redistributions in binary form must reproduce the above copyright
   18       notice, this list of conditions and the following disclaimer in
   19       the documentation and/or other materials provided with the
   20       distribution.
   21
   22    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   23    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   24    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
   25    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
   26    COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   27    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
   28    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   29    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
   30    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   31    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
   32    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   33    POSSIBILITY OF SUCH DAMAGE.
   34*/
   35
   36:- module(uri,
   37          [ uri_components/2,           % ?URI, ?Components
   38            uri_data/3,                 % ?Field, +Components, ?Data
   39            uri_data/4,                 % +Field, +Components, -Data, -New
   40	    uri_edit/3,			% +Actions,+URI0,-URI
   41
   42            uri_normalized/2,           % +URI, -NormalizedURI
   43            iri_normalized/2,           % +IRI, -NormalizedIRI
   44            uri_normalized_iri/2,       % +URI, -NormalizedIRI
   45            uri_normalized/3,           % +URI, +Base, -NormalizedURI
   46            iri_normalized/3,           % +IRI, +Base, -NormalizedIRI
   47            uri_normalized_iri/3,       % +URI, +Base, -NormalizedIRI
   48            uri_resolve/3,              % +URI, +Base, -AbsURI
   49            uri_is_global/1,            % +URI
   50            uri_query_components/2,     % ?QueryString, ?NameValueList
   51            uri_authority_components/2, % ?Authority, ?Components
   52            uri_authority_data/3,       % ?Field, ?Components, ?Data
   53					% Encoding
   54            uri_encoded/3,              % +Component, ?Value, ?Encoded
   55            uri_file_name/2,            % ?URI, ?Path
   56            uri_iri/2                   % ?URI, ?IRI
	  ]).
:- autoload(library(error), [domain_error/2, instantiation_error/1]).
:- if(exists_source(library(socket))).
:- autoload(library(socket), [gethostname/1]).
:- endif.

:- use_foreign_library(foreign(uri)).

Process URIs

This library provides high-performance C-based primitives for manipulating URIs. We decided for a C-based implementation for the much better performance on raw character manipulation. Notably, URI handling primitives are used in time-critical parts of RDF processing. This implementation is based on RFC-3986:

http://labs.apache.org/webarch/uri/rfc/rfc3986.html

The URI processing in this library is rather liberal. That is, we break URIs according to the rules, but we do not validate that the components are valid. Also, percent-decoding for IRIs is liberal. It first tries UTF-8; then ISO-Latin-1 and finally accepts %-characters verbatim.

Earlier experience has shown that strict enforcement of the URI syntax results in many errors that are accepted by many other web-document processing tools.

This library provides explicit support for URN URIs. */

 uri_components(+URI, -Components) is det
uri_components(-URI, +Components) is det
Break a URI into its 5 basic components according to the RFC-3986 regular expression:
^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
 12            3  4          5       6  7        8 9

If the scheme is urn, the URI is broken into its scheme, NID (Namespace Identifier) and NSS (Namespace Specific String).

Arguments:
Components- is one of
uri_components(Scheme, Authority, Path, Search, Fragment)
If a URI is parsed, i.e., using mode (+,-), components that are not found are left uninstantiated (variable). See uri_data/3 for accessing this structure.
urn_components(Scheme, NID, NSS, Search, Fragment)
Here Scheme is always urn. Otherwise the same comments as for uri_components/5 apply.
 uri_data(+Field, +Components, -Data) is semidet
uri_data(-Field, +Components, -Data) is nondet
Provide access to the uri_components or urn_components structure. The Field scheme is always present. Other fields depend on the scheme. The urn scheme provides nid and nss. Other schemes provide authority and path. Both kinds of structure provide search and fragment.
  118uri_data(Field, Components, Data), var(Field) =>
  119    uri_data_(Field, Components, Data).
  120uri_data(Field, Components, Data), atom(Field) =>
  121    uri_data_(Field, Components, Data),
  122    !.
  123
  124uri_data_(scheme,    uri_components(S, _, _, _, _), S).
  125uri_data_(authority, uri_components(_, A, _, _, _), A).
  126uri_data_(path,      uri_components(_, _, P, _, _), P).
  127uri_data_(search,    uri_components(_, _, _, S, _), S).
  128uri_data_(fragment,  uri_components(_, _, _, _, F), F).
  129uri_data_(scheme,    urn_components(S, _, _, _, _), S).
  130uri_data_(nid,       urn_components(_, I, _, _, _), I).
  131uri_data_(nss,       urn_components(_, _, N, _, _), N).
  132uri_data_(search,    urn_components(_, _, _, S, _), S).
  133uri_data_(fragment,  urn_components(_, _, _, _, F), F).
 uri_data(+Field, +Components, +Data, -NewComponents) is det
NewComponents is the same as Components with Field set to Data.
Errors
- domain_error(uri_field, Field) if Field is invalid.
- instantiation_error if Field or Components is unbound.
  142uri_data(scheme,    uri_components(_, A, P, Q, F), S, New) =>
  143    New = uri_components(S, A, P, Q, F).
  144uri_data(scheme,    urn_components(_, I, N), S, New) =>
  145    New = urn_components(S, I, N).
  146uri_data(authority, uri_components(S, _, P, Q, F), A, New) =>
  147    New = uri_components(S, A, P, Q, F).
  148uri_data(path,      uri_components(S, A, _, Q, F), P, New) =>
  149    New = uri_components(S, A, P, Q, F).
  150uri_data(search,    uri_components(S, A, P, _, F), Q, New) =>
  151    New = uri_components(S, A, P, Q, F).
  152uri_data(search,    urn_components(S, A, P, _, F), Q, New) =>
  153    New = urn_components(S, A, P, Q, F).
  154uri_data(fragment,  uri_components(S, A, P, Q, _), F, New) =>
  155    New = uri_components(S, A, P, Q, F).
  156uri_data(fragment,  urn_components(S, A, P, Q, _), F, New) =>
  157    New = urn_components(S, A, P, Q, F).
  158uri_data(nid,       urn_components(S, _, N), I, New) =>
  159    New = urn_components(S, I, N).
  160uri_data(nss,       urn_components(S, I, _), N, New) =>
  161    New = urn_components(S, I, N).
  162uri_data(_,         Components, _N, _New), var(Components) =>
  163    instantiation_error(Components).
  164uri_data(Field,     _, _N, _New) =>
  165    domain_error(uri_field, Field).
 uri_normalized(+URI, -NormalizedURI) is det
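NormalizedURI is the normalized form of URI. Normalization is syntactic and involves the following steps (see RFC 3986, section 6.2.2):
- 6.2.2.1. Case Normalization
- 6.2.2.2. Percent-Encoding Normalization
- 6.2.2.3. Path Segment Normalization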
 iri_normalized(+IRI, -NormalizedIRI) is det
NormalizedIRI is the normalized form of IRI. Normalization is syntactic and involves the following steps (see RFC 3986, section 6.2.2):
- 6.2.2.1. Case Normalization
- 6.2.2.3. Path Segment Normalization
See also
- This is similar to uri_normalized/2, but does not do normalization of %-escapes.
 uri_normalized_iri(+URI, -NormalizedIRI) is det
As uri_normalized/2, but percent-encoding is translated into IRI Unicode characters. The translation is liberal: valid UTF-8 sequences of %-encoded bytes are mapped to the Unicode character. Other %XX-sequences are mapped to the corresponding ISO-Latin-1 character and sole % characters are left untouched.
See also
- uri_iri/2.
 uri_is_global(+URI) is semidet
True if URI has a scheme. The semantics is the same as the code below, but the implementation is more efficient as it does not need to parse the other components, nor needs to bind the scheme. The condition to demand a scheme of more than one character is added to avoid confusion with DOS path names.
uri_is_global(URI) :-
        uri_components(URI, Components),
        uri_data(scheme, Components, Scheme),
        nonvar(Scheme),
        atom_length(Scheme, Len),
        Len > 1.
 uri_resolve(+URI, +Base, -GlobalURI) is det
Resolve a possibly local URI relative to Base. This implements http://labs.apache.org/webarch/uri/rfc/rfc3986.html#relative-transform
 uri_normalized(+URI, +Base, -NormalizedGlobalURI) is det
NormalizedGlobalURI is the normalized global version of URI. Behaves as if defined by:
uri_normalized(URI, Base, NormalizedGlobalURI) :-
        uri_resolve(URI, Base, GlobalURI),
        uri_normalized(GlobalURI, NormalizedGlobalURI).
 iri_normalized(+IRI, +Base, -NormalizedGlobalIRI) is det
NormalizedGlobalIRI is the normalized global version of IRI. This is similar to uri_normalized/3, but does not do %-escape normalization.
 uri_normalized_iri(+URI, +Base, -NormalizedGlobalIRI) is det
NormalizedGlobalIRI is the normalized global IRI of URI. Behaves as if defined by:
uri_normalized_iri(URI, Base, NormalizedGlobalIRI) :-
        uri_resolve(URI, Base, GlobalURI),
        uri_normalized_iri(GlobalURI, NormalizedGlobalIRI).
 uri_query_components(+String, -Query) is det
uri_query_components(-String, +Query) is det
Perform encoding and decoding of an URI query string. Query is a list of fully decoded (Unicode) Name=Value pairs. In mode (-,+), query elements of the forms Name(Value) and Name-Value are also accepted to enhance interoperability with the option and pairs libraries. E.g.
?- uri_query_components(QS, [a=b, c('d+w'), n-'VU Amsterdam']).
QS = 'a=b&c=d%2Bw&n=VU%20Amsterdam'.

?- uri_query_components('a=b&c=d%2Bw&n=VU%20Amsterdam', Q).
Q = [a=b, c='d+w', n='VU Amsterdam'].
 uri_authority_components(+Authority, -Components) is det
uri_authority_components(-Authority, +Components) is det
Break-down the authority component of a URI. The fields of the structure Components can be accessed using uri_authority_data/3. This predicate deals with IPv6 addresses written as [ip], returning the ip as host, without the enclosing []. When constructing an authority string and the host contains :, the host is embraced in []. If [] is not used correctly, the behavior should be considered poorly defined. If there is no balancing `]` or the host part does not end with `]`, these characters are considered normal characters and part of the (invalid) host name.
 uri_authority_data(+Field, ?Components, ?Data) is semidet
Provide access to the uri_authority structure. Defined field names are: user, password, host and port.
  286uri_authority_data(user,     uri_authority(U, _, _, _), U).
  287uri_authority_data(password, uri_authority(_, P, _, _), P).
  288uri_authority_data(host,     uri_authority(_, _, H, _), H).
  289uri_authority_data(port,     uri_authority(_, _, _, P), P).
 uri_encoded(+Component, +Value, -Encoded) is det
uri_encoded(+Component, -Value, +Encoded) is det
Encoded is the URI encoding for Value. When encoding (Value->Encoded), Component specifies the URI component where the value is used. It is one of query_value, fragment, path or segment. Besides alphanumerical characters, the following characters are passed verbatim (the set is split in logical groups according to RFC3986).
query_value, fragment
"-._~" | "!$'()*,;" | "@" | "/?"
path
"-._~" | "!$&'()*,;=" | "@" | "/"
segment
"-._~" | "!$&'()*,;=" | "@"
 uri_iri(+URI, -IRI) is det
uri_iri(-URI, +IRI) is det
Convert between a URI, encoded in US-ASCII and an IRI. An IRI is a fully expanded Unicode string. Unicode strings are first encoded into UTF-8, after which %-encoding takes place.
Errors
- syntax_error(Culprit) in mode (+,-) if URI is not a legally percent-encoded UTF-8 string.
 uri_file_name(+URI, -FileName) is semidet
uri_file_name(-URI, +FileName) is det
Convert between a URI and a local file_name. This protocol is covered by RFC 1738. Please note that file-URIs use absolute paths. The mode (-, +) translates a possible relative path into an absolute one.
  328uri_file_name(URI, FileName) :-
  329    nonvar(URI),
  330    !,
  331    uri_components(URI, Components),
  332    uri_data(scheme, Components, File), File == file,
  333    uri_data(authority, Components, Host),
  334    my_host(Host),
  335    uri_data(path, Components, FileNameEnc),
  336    uri_encoded(path, FileName0, FileNameEnc),
  337    delete_leading_slash(FileName0, FileName).
  338uri_file_name(URI, FileName) :-
  339    nonvar(FileName),
  340    !,
  341    absolute_file_name(FileName, Path0),
  342    ensure_leading_slash(Path0, Path),
  343    uri_encoded(path, Path, PathEnc),
  344    uri_data(scheme, Components, file),
  345    uri_data(authority, Components, ''),
  346    uri_data(path, Components, PathEnc),
  347    uri_components(URI, Components).
  348
  349my_host('') :- !.
  350my_host(localhost) :- !.
  351:- if(exists_source(library(socket))).  352my_host(Host) :-
  353    gethostname(Host).
  354:- endif.
 ensure_leading_slash(+WinPath, -Path)
 delete_leading_slash(+Path, -WinPath)
Deal with the fact that absolute paths in Windows start with a drive letter rather than a /. For URIs we need a path that starts with a /.
  363ensure_leading_slash(Path, SlashPath) :-
  364    (   sub_atom(Path, 0, _, _, /)
  365    ->  SlashPath = Path
  366    ;   atom_concat(/, Path, SlashPath)
  367    ).
  368
  369:- if(current_prolog_flag(windows, true)).  370delete_leading_slash(Path, WinPath) :-
  371    atom_concat(/, WinPath, Path),
  372    is_absolute_file_name(WinPath),
  373    !.
  374:- endif.  375delete_leading_slash(Path, Path).
  376
  377
  378		 /*******************************
  379		 *          MODIFYING           *
  380		 *******************************/
 uri_edit(+Actions, +URI0, -URI) is det
Modify a URI according to Actions. Actions is either a single action or a (nested) list of actions. Defined primitive actions are:
scheme(+Scheme)
Set the Scheme of the URI (typically http, https, etc.)
user(+User)
Add/set the user of the authority component.
password(+Password)
Add/set the password of the authority component.
host(+Host)
Add/set the host (or ip address) of the authority component.
port(+Port)
Add/set the port of the authority component.
path(+Path)
Set/extend the path component. If Path is not absolute it is taken relative to the path of URI0.
search(+KeyValues)
Extend the Key=Value pairs of the current search (query) component. New values replace existing values. If KeyValues is written as =(KeyValues) the current search component is ignored. KeyValues is a list, whose elements are one of Key=Value, Key-Value or `Key(Value)`.
fragment(+Fragment)
Set the Fragment of the uri.
nid(+NID)
Set the Namespace Identifier for a URN URI.
nss(+NSS)
Set the Namespace Specific String for a URN URI.

Components can be removed by using a variable as value, except for path, which can be reset using path(/), and query, which can be dropped using query(=([])).

Arguments:
URI0- is either a valid uri or a variable to start fresh.
  420uri_edit(Actions, URI0, URI) :-
  421    (   var(URI0)
  422    ->  URI1 = '/'
  423    ;   URI1 = URI0
  424    ),
  425    uri_components(URI1, Comp0),
  426    edit_components(Actions, Comp0, Comp),
  427    uri_components(URI, Comp).
  428
  429edit_components([], Comp0, Comp) =>
  430    Comp = Comp0.
  431edit_components([H|T], Comp0, Comp) =>
  432    edit_components(H, Comp0, Comp1),
  433    edit_components(T, Comp1, Comp).
  434edit_components(scheme(Scheme), Comp0, Comp) =>
  435    uri_data(scheme, Comp0, Scheme, Comp).
  436edit_components(path(Path), Comp0, Comp) =>
  437    uri_data(path, Comp0, Path0),
  438    (   (   var(Path0)
  439        ;   Path0 == ''
  440        )
  441    ->  Path1 = '/'
  442    ;   Path1 = Path0
  443    ),
  444    uri_normalized(Path, Path1, Path2),
  445    uri_data(path, Comp0, Path2, Comp).
  446edit_components(fragment(Fragment), Comp0, Comp) =>
  447    uri_data(fragment, Comp0, Fragment, Comp).
  448edit_components(Authority, Comp0, Comp),
  449  authority_field(Authority) =>
  450    uri_data(authority, Comp0, Auth0),
  451    (   var(Auth0)
  452    ->  true
  453    ;   uri_authority_components(Auth0, AComp0)
  454    ),
  455    edit_auth_components(Authority, AComp0, AComp),
  456    uri_authority_components(Auth, AComp),
  457    uri_data(authority, Comp0, Auth, Comp).
  458edit_components(query(Search), Comp0, Comp) =>
  459    edit_components(search(Search), Comp0, Comp).
  460edit_components(search(=(Search)), Comp0, Comp) =>
  461    uri_query_components(String, Search),
  462    uri_data(search, Comp0, String, Comp).
  463edit_components(search(Search), Comp0, Comp) =>
  464    uri_data(search, Comp0, SS0),
  465    (   var(SS0)
  466    ->  Search0 = []
  467    ;   uri_query_components(SS0, Search0)
  468    ),
  469    join_search(Search0, Search, Search1),
  470    uri_query_components(SS1, Search1),
  471    uri_data(search, Comp0, SS1, Comp).
  472edit_components(nid(NID), Comp0, Comp) =>
  473    uri_data(fragment, Comp0, NID, Comp).
  474edit_components(nss(NSS), Comp0, Comp) =>
  475    uri_data(fragment, Comp0, NSS, Comp).
  476edit_components(Other, _, _) =>
  477    domain_error(uri_edit, Other).
  478
  479authority_field(user(_)).
  480authority_field(password(_)).
  481authority_field(host(_)).
  482authority_field(port(_)).
  483
  484edit_auth_components(user(User),
  485		     uri_authority(_, Passwd, Host, Port),
  486		     uri_authority(User, Passwd, Host, Port)).
  487edit_auth_components(password(Passwd),
  488		     uri_authority(User, _, Host, Port),
  489		     uri_authority(User, Passwd, Host, Port)).
  490edit_auth_components(host(Host),
  491		     uri_authority(User, Passwd, _, Port),
  492		     uri_authority(User, Passwd, Host, Port)).
  493edit_auth_components(port(Port),
  494		     uri_authority(User, Passwd, Host, _),
  495		     uri_authority(User, Passwd, Host, Port)).
  496
  497join_search([], Search, Search).
  498join_search([N=_|ST], New, Search) :-
  499    (   memberchk(N=_, New)
  500    ->  true
  501    ;   functor(T, N, 1),
  502	memberchk(T, New)
  503    ->  true
  504    ;   memberchk(N-_, New)
  505    ),
  506    !,
  507    join_search(ST, New, Search).
  508join_search([H|ST], New, [H|Search]) :-
  509    join_search(ST, New, Search).
  510
  511
  512                 /*******************************
  513                 *            SANDBOX           *
  514                 *******************************/
  515
  516:- multifile sandbox:safe_primitive/1.  517
  518sandbox:safe_primitive(uri:uri_components(_,_)).
  519sandbox:safe_primitive(uri:uri_normalized(_,_)).
  520sandbox:safe_primitive(uri:iri_normalized(_,_)).
  521sandbox:safe_primitive(uri:uri_normalized_iri(_,_)).
  522sandbox:safe_primitive(uri:uri_normalized(_,_,_)).
  523sandbox:safe_primitive(uri:iri_normalized(_,_,_)).
  524sandbox:safe_primitive(uri:uri_normalized_iri(_,_,_)).
  525sandbox:safe_primitive(uri:uri_resolve(_,_,_)).
  526sandbox:safe_primitive(uri:uri_is_global(_)).
  527sandbox:safe_primitive(uri:uri_query_components(_,_)).
  528sandbox:safe_primitive(uri:uri_authority_components(_,_)).
  529sandbox:safe_primitive(uri:uri_encoded(_,_,_)).
  530sandbox:safe_primitive(uri:uri_iri(_,_))