1:- module(dbpedia_matcher,
2 [band/1,
3 has_shared_band_member/3,
4 has_shared_genre/3,
5 similarity_by_genre/3,
6 jaccard/3,
7 pair_genre_sum_ic/3]).
19% now builtin 20%:- use_module(library(tabling)). 21:- use_module(library(sparqlprog/emulate_builtins)). 22:- use_module(library(sparqlprog/ontologies/dbpedia)).
true if SharedMember is a member of both B1 and B2 (and B1 and B2 are distinct)
31has_shared_band_member(B1,B2,A) :-
32 rdf(A,dbont:associatedBand,B1),
33 rdf(A,dbont:associatedBand,B2),
34 B1\=B2.
true if SharedGenre is a genre of both B1 and B2 (and B1 and B2 are distinct)
42has_shared_genre(B1,B2,A) :- 43 rdf(B1,dbont:genre,A), 44 rdf(B2,dbont:genre,A), 45 B1\=B2. 46 47genre_pair(G1,G2,A) :- 48 rdf(A,dbont:genre,G1), 49 rdf(A,dbont:genre,G2), 50 G1\=G2.
the entities should be of similar types (e.g. two bands, or two books)
| genres(A) /\ genres(B) | / | genres(A) \/ genres(B) |
if no genres are in common, then this should equal 0 if all genres are in common, then this should equal 1
note this does not take into account how meaningful it is for a genre to be shared; e.g. sharing the common genre 'pop' counts as much as a rarer genre like 'psytrance'. see further on for IC-based metrics.
71similarity_by_genre(A,B,Sim) :- 72 get_all_genres(A,SA), 73 get_all_genres(B,SB), 74 jaccard(SA,SB,Sim). 75 76jaccard(SA,SB,Sim) :- 77 ord_intersection(SA,SB,I), 78 ord_union(SA,SB,U), 79 length(I,NI), 80 length(U,NU), 81 Sim is NI/NU. 82 83get_all_genres(Entity,L) :- 84 service_query_all(dbpedia,G,rdf(Entity,dbont:genre,G),L). 85 86 87 88:- table get_num_bands/1.
note this is tabled (cached) so that repeated calls do not invoke new SPARQL queries
95get_num_bands(Count) :- 96 ??(dbpedia,num_bands(Count)). 97num_bands(Count) :- 98 aggregate(count(distinct(B)),band(B),Count).
%%%:- table get_genre_num_bands/2.
109get_genre_num_bands(G,Count) :- 110 ??(dbpedia,genre_num_bands(G,Count)). 111 112genre_num_bands(G,Count) :- 113 aggregate_group(count(distinct(B)),[G],(rdf(B,dbont:genre,G),band(B)),Count).
Example: =pair_genre_ic(dbr:'Metallica', dbr:'Megadeth', IC)
=
123pair_genre_sum_ic(A,B,SumIC) :-
124 get_all_genres(A,SA),
125 ??(dbpedia,(band(B),has_shared_genre(A,B,_))),
126 get_all_genres(B,SB),
127 ord_intersection(SA,SB,I),
128 debug(dbpedia,'~w vs ~w :: INTERSECTION(~w + ~w) = ~w',[A,B,SA,SB,I]),
129 aggregate(sum(IC),G^(member(G,I),genre_ic(G,IC)),SumIC).
for example, many bands are pop, so this would have a low IC. Progressive sludge metal is relatively rare and would have a high IC
InformationContent = -log2( Pr(Genre) )
141genre_ic(G,IC) :-
142 get_genre_num_bands(G,Count),
143 debug(dbpedia,'|bands| in ~w = ~w',[G,Count]),
144 get_num_bands(Total),
145 debug(dbpedia,'Total bands = ~w',[Total]),
146 seval(-log(Count/Total)/log(2), IC)
perform entity matching/similarity using dbpedia
This module is intended primarily as an example of how to mix local logic and remote sparql logic.
*/