1:- module(atom_feed, [ new_feed/2 2 , author/2 3 , content/2 4 , description/2 5 , email/2 6 , entry/2 7 , href/2 8 , id/2 9 , link/2 10 , name/2 11 , published/2 12 , rel/2 13 , summary/2 14 , title/2 15 , type/2 16 , updated/2 17 ] 18 ). 19 20:- use_module(library(charsio), [open_chars_stream/2]). 21:- use_module(library(delay)). 22:- use_module(library(http/http_open), [http_open/3]). 23:- use_module(library(http/http_ssl_plugin)). % for SSL 24:- use_module(library(sgml), [load_sgml/3]). 25:- use_module(library(xpath)). 26 27 28:- multifile delay:mode/1. 29delaymode(system:atomic_list_concat(ground,ground,_)). 30delaymode(system:atomic_list_concat(_,ground,ground)).
stream(Stream)
- an input streamfile(File)
- a file name to read fromatom(Atom)
- Atom or RSS XML in an atomcodes(Codes)
- Atom or RSS XML in a list of codesurl(Url)
- a URL to fetchThis is the first step in working with an Atom or RSS feed. If an RSS feed has multiple channels, only the first channel is considered.
45new_feed(stream(Stream), Feed) :- 46 parse_xml(Stream, Flavor, Tree), 47 wrap_feed(Flavor, Tree, Feed). 48new_feed(file(File), Feed) :- 49 parse_xml(File, Flavor, Tree), 50 wrap_feed(Flavor, Tree, Feed). 51new_feed(atom(Atom), Feed) :- 52 atom_codes(Atom, Codes), 53 new_feed(codes(Codes), Feed). 54new_feed(codes(Codes), Feed) :- 55 open_chars_stream(Codes, Stream), 56 new_feed(stream(Stream), Feed). 57new_feed(url(Url), Feed) :- 58 setup_call_cleanup( http_open(Url, Stream, [ timeout(10) 59 , cert_verify_hook(ssl_verify) 60 ]) 61 , new_feed(stream(Stream), Feed) 62 , close(Stream) 63 ). 64 65% accept all SSL certificates 66ssl_verify( _SSL 67 , _ProblemCertificate 68 , _AllCertificates 69 , _FirstCertificate 70 , _Error 71 ). 72 73 74% convenience for new_feed/2 when parsing XML 75parse_xml(Source, Flavor, Tree) :- 76 load_sgml( Source 77 , Parts 78 , [ dialect(xmlns) 79 , call(urlns, url_ns) 80 %, call(xmlns, on_xmlns) % to be notified of xmlns declarations 81 ] 82 ), 83 member(Root-Flavor, [(atom:feed)-atom, rss-rss]), 84 Tree = element(Root,_Attrs,_Children), 85 memberchk(Tree, Parts), 86 !. % accept the first solution 87 88 89 90wrap_feed(atom, Tree, atom_feed(Tree)). 91wrap_feed(rss, RssTree, rss_feed(ChannelTree)) :- 92 once(xpath(RssTree, /rss/channel, ChannelTree)). 93 94 95% map URLs to namespace prefixes (for convenience) 96url_ns('http://www.w3.org/2005/Atom', atom, _). 97 98 99% convenience for matching XPath expressions w/o needing matched element 100xpath(Dom, Spec) :- 101 xpath(Dom, Spec, _).
108id(atom_feed(Dom), IdText) :- 109 once(xpath(Dom, /(atom:feed)/(atom:id), Id)), 110 once(xpath(Id, /'*'(text), IdText)). 111id(atom_entry(Dom), IdText) :- 112 once(xpath(Dom, /(atom:entry)/(atom:id), Id)), 113 once(xpath(Id, /'*'(text), IdText)). 114id(rss_entry(Dom), IdText) :- 115 once(xpath(Dom, /item/guid(text), IdText)).
123author(atom_feed(Dom), atom_author(Author)) :- 124 xpath(Dom, /(atom:feed)/(atom:author), Author). 125author(atom_entry(Dom), atom_author(Author)) :- 126 xpath(Dom, /(atom:entry)/(atom:author), Author). 127author(rss_entry(Dom), rss_author(Author)) :- 128 xpath(Dom, /item/author, Author).
type
or src
attributes.135content(atom_entry(Entry), ContentText) :- 136 once(xpath(Entry, /(atom:entry)/(atom:content), Content)), 137 once(xpath(Content, /'*'(text), ContentText)). 138content(rss_entry(E), ContentText) :- 139 Entry = rss_entry(E), 140 \+ link(Entry, _), % no link means description is the content 141 description(Entry, ContentText). 142 143 144%% description(+Entry, -Description:atom) is semidet. 145% 146% True if Entry has a Description. This predicate is peculiar to RSS 147% entries. The RSS spec confounds summary and content into a single 148% 'description' field. 149description(rss_entry(Entry), Description) :- 150 once(xpath(Entry, /item/description(text), Description)).
It's not possible to reliably extract summaries from RSS entries. That's because summary and content is confounded into a single 'description' element.
162summary(atom_entry(Entry), SummaryText) :-
163 % summary on entry is optional. See RFC4287 4.2.13
164 once(xpath(Entry, /(atom:entry)/(atom:summary), Summary)),
165 once(xpath(Summary, /'*'(text), SummaryText)),
166
167 % must have text different from 'content' and 'title'
168 dif(SummaryText, Title),
169 title(atom_entry(Entry), Title),
170 ( content(atom_entry(Entry), Content) -> dif(SummaryText, Content) ; true ).
177title(atom_feed(Dom), TitleText) :- 178 once(xpath(Dom, /(atom:feed)/(atom:title), Title)), 179 once(xpath(Title, /'*'(text), TitleText)). 180title(rss_feed(Dom), TitleText) :- 181 once(xpath(Dom, /channel/title(text), TitleText)). 182title(atom_entry(Dom), TitleText) :- 183 xpath(Dom, /(atom:entry)/(atom:title), Title), 184 once(xpath(Title, /'*'(text), TitleText)). 185title(rss_entry(Dom), TitleText) :- 186 once(xpath(Dom, /item/title(text), TitleText)). 187title(atom_link(Dom), TitleText) :- 188 Dom = element(_,Attrs,_), 189 memberchk(title=TitleText, Attrs).
196entry(atom_feed(Dom), atom_entry(Entry)) :- 197 xpath(Dom, /(atom:feed)/(atom:entry), Entry). 198entry(rss_feed(Dom), rss_entry(Entry)) :- 199 xpath(Dom, /channel/item, Entry).
For example, to find the URL to an HTML alternative:
link(Feed, Link), rel(Link, alternate), type(Link, text/html), href(Link, Url).
214link(atom_feed(Dom), atom_link(Link)) :- 215 xpath(Dom, /(atom:feed)/(atom:link), Link). 216link(rss_feed(Dom), rss_link(Link)) :- 217 once(xpath(Dom, /channel/link, Link)). % spec says "the" suggesting 1 218link(atom_entry(Dom), atom_link(Link)) :- 219 xpath(Dom, /(atom:entry)/(atom:link), Link). 220link(rss_entry(Dom), rss_link(Link)) :- 221 xpath(Dom, /item/link, Link).
228published(atom_entry(Dom), Epoch) :- 229 once(xpath(Dom, /(atom:entry)/(atom:published), Published)), 230 xpath(Published, /'*'(text), Date), 231 parse_time(Date, Epoch). 232published(rss_entry(Dom),Epoch) :- 233 once(xpath(Dom, /item/pubDate(text), Date)), parse_time(Date, Epoch).
241updated(atom_entry(Dom), Epoch) :-
242 once(xpath(Dom, /(atom:entry)/(atom:updated), Updated)),
243 xpath(Updated, /'*'(text), Date),
244 parse_time(Date, Epoch).
250email(atom_author(Author), EmailText) :- 251 once(xpath(Author, /(atom:author)/(atom:email), Email)), 252 xpath(Email, /'*'(text), EmailText). 253email(rss_author(Author), Email) :- 254 once(xpath(Author, /author(text), Email)).
262name(atom_author(Author), NameText) :-
263 % name on author is mandatory. See RFC4287 3.2.1
264 once(xpath(Author, /(atom:author)/(atom:name), Name)),
265 xpath(Name, /'*'(text), NameText).
alternate
is used
instead. That's why this predicate's mode is det
.
Because RSS links can't specify a 'rel', we use alternate
for them
too.
276rel(atom_link(Link), Rel) :- 277 % rel on link is optional, defaults to "alternate". 278 % See RFC4287 4.2.7.2 279 Link = element(_,Attrs,_), 280 ( memberchk(rel=Rel0,Attrs) -> 281 true 282 ; true -> 283 Rel0 = alternate 284 ), 285 Rel = Rel0. 286rel(rss_link(_), alternate).
292href(atom_link(Link), Href) :- 293 % href on link is mandatory. See RFC4287 4.2.7.1 294 once(xpath(Link, /'*'(@href=Href))). 295href(rss_link(Link), Href) :- 296 Link = element(link, _, [Href]).
Type/Subtype
which allows one to do things like
type(Link, text/_) % link has a textual type
The Atom spec says that 'type' is optional. If it's missing, this
predicate fails. RSS spec says the link points to "the HTML website",
so MediaType is always text/html
.
Subtype may contain punctuation so remember to quote:
application/'atom+xml'
.
312type(atom_link(element(_,Attrs,_)), Type/Subtype) :- 313 delay(atomic_list_concat([Type, Subtype], '/', RawType)), 314 315 % type on link is optional, with no default. 316 % See RFC4287 4.2.7.3 317 memberchk(type=RawType, Attrs). 318type(rss_link(_), text/html)