% This file is part of the Attempto Parsing Engine (APE). % Copyright 2008-2013, Attempto Group, University of Zurich (see http://attempto.ifi.uzh.ch). % % The Attempto Parsing Engine (APE) is free software: you can redistribute it and/or modify it % under the terms of the GNU Lesser General Public License as published by the Free Software % Foundation, either version 3 of the License, or (at your option) any later version. % % The Attempto Parsing Engine (APE) is distributed in the hope that it will be useful, but WITHOUT % ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR % PURPOSE. See the GNU Lesser General Public License for more details. % % You should have received a copy of the GNU Lesser General Public License along with the Attempto % Parsing Engine (APE). If not, see http://www.gnu.org/licenses/. :- module(morphgen, [ clear_vars/0, add_var/1, remove_singletons/2, acesentencelist_pp/2, listlist_listatom/2, format_dr/2, surface_noun/4, surface_verb/3, surface_neg_verb/3, surface_adverb/3, surface_property/3, surface_property/6, surface_quotedstring/2, get_di_marker/4 ]). /** Morphological synthesis and text generation @author Kaarel Kaljurand @version 2010-04-20 */ :- use_module('../lexicon/lexicon_interface', []). :- use_module('../lexicon/chars', [ is_capitalized/1 ]). :- use_module(ace_niceace, [ ace_niceace/2, word_article/2, atom_capitalize/2 ]). :- dynamic var_once/1. :- dynamic var_twice/1. :- dynamic var/2. %% format_dr(+Dr:nvar, -NiceDr:atom) is det. % % @param Dr numbervared variable % @param NiceDr pretty-printed numbervared variable % format_dr(Dr, NiceDr) :- format(atom(NiceDr), '~W', [Dr, [numbervars(true)]]). %% surface_verb(+SgPlPart:atom, +Lemma:atom, -WordForm:atom) is det. % % Examples: % % * surface_verb(sg, know, knows) % * surface_verb(pl, know, know) % * surface_verb(part, know, known) % % BUGs: % % * reorder the arguments SgPl and Lemma % % * rewrite to support plural participles, i.e. we need 2 features: % number = {sg, pl}, participleness = {yes, no} % % * maybe this would be faster than concat_atom/3: % format(atom(WordForm), "is ~w by", [WordFormTmp]) % % * this is a quick hack to make ACE Core verbalization work with i-verbs. % % @param SgPlPart {sg, pl, part}, is the number of the verb or its participleness % @param Lemma lemma of the verb % @param WordForm is the synthesized form of the verb % surface_verb(_SgPl, i(Lemma), WordForm) :- !, surface_verb_x(part, Lemma, WordFormTmp), concat_atom([is, WordFormTmp, by], ' ', WordForm). surface_verb(SgPl, Lemma, WordForm) :- surface_verb_x(SgPl, Lemma, WordForm). %% surface_verb_x(+SgPlPart:atom, +Lemma:atom, -WordForm:atom) is det. % % @param SgPlPart {sg, pl, part}, is the number of the verb or its participleness % @param Lemma lemma of the verb % @param WordForm is the synthesized form of the verb % surface_verb_x(sg, Verb, SurfaceVerb) :- verb_sg(Verb, SurfaceVerb). surface_verb_x(pl, Verb, SurfaceVerb) :- verb_pl(Verb, SurfaceVerb). surface_verb_x(part, Verb, VerbEd) :- lexicon_interface:tv_pp(VerbEd, Verb), !. surface_verb_x(part, Verb, VerbEd) :- lexicon_interface:dv_pp(VerbEd, Verb, _), !. surface_verb_x(part, Verb, [v, ':', Verb]). %% surface_neg_verb(+SgPl:atom, +Lemma:atom, -WordForm:list) is det. % % BUG: We should also support participles here. % % @param SgPl {sg, pl}, is the number of the negated verb % @param Lemma lemma of the negated verb % @param WordForm is the synthesized form of the verb % surface_neg_verb(sg, be, [is, not]) :- !. surface_neg_verb(pl, be, [are, not]) :- !. surface_neg_verb(sg, Verb, [does, not, WordForm]) :- !, surface_verb(pl, Verb, WordForm). surface_neg_verb(pl, Verb, [do, not, WordForm]) :- !, surface_verb(pl, Verb, WordForm). %% surface_adverb(+Adverb:atom, +Comparison:atom, -SurfaceText:list) is det. % % Note that comparative forms like 'faster', are lemmatized as 'fast'. % % @param Adverb is a lemma form of an adverb, as found in the DRS. % @param Comparision is one of {pos, comp, sup} % @param SurfaceText is a combination of the adverb and the comparision type. % surface_adverb(Adverb, pos, [Adverb]). surface_adverb(Adverb, comp, [CompAdverb]) :- lexicon_interface:adv_comp(CompAdverb, Adverb), !. surface_adverb(Adverb, comp, [more, Adverb]) :- lexicon_interface:adv(Adverb, _), !. surface_adverb(Adverb, comp, [more, a, ':', Adverb]). surface_adverb(Adverb, sup, [SupAdverb]) :- lexicon_interface:adv_sup(SupAdverb, Adverb), !. surface_adverb(Adverb, sup, [most, Adverb]) :- lexicon_interface:adv(Adverb, _), !. surface_adverb(Adverb, sup, [most, a, ':', Adverb]). %% surface_property(+Adjective:atom, +Comparison:atom, -SurfaceText:list) is det. % % Note that comparative forms like 'better', are lemmatized as 'good'. % % @param Adjective is a lemma form of an adjective, as found in the DRS. % @param Comparision is one of {pos, pos_as, comp, sup, comp_than} % @param SurfaceText is a combination of the adjective and the comparision type. % surface_property(Adjective, pos, [Adjective]) :- lexicon_interface:adj_itr(Adjective, _), !. surface_property(Adjective, pos, [Adjective]) :- lexicon_interface:adj_tr(Adjective, _, _), !. surface_property(Adjective, pos, [a, ':', Adjective]). surface_property(Adjective, pos_as, [as, Adjective, as]) :- lexicon_interface:adj_itr(Adjective, _), !. surface_property(Adjective, pos_as, [as, a, ':', Adjective, as]). surface_property(Adjective, comp, MoreAdjective) :- pos_comp(Adjective, MoreAdjective, _). surface_property(Adjective, sup, MostAdjective) :- pos_sup(Adjective, MostAdjective). surface_property(Adjective, comp_than, [MoreAdjective, than]) :- pos_comp(Adjective, MoreAdjective, _). %% surface_property(+Adjective:atom, +Comparison:atom, +ComparisonTarget:atom, +Ref2Text:list, +Ref3Text:list,-SurfaceText:list) is det. % % Note that comparative forms like 'fonder-of', are lemmatized as 'fond-of'. % % Examples: % % * [John is] as fond-of Mary as Bill % * [John is] as fond-of Mary as of Bill % * [John is] more fond-of Mary than Bill % * [John is] more fond-of Mary than of Bill % % @param Adjective is a lemma form of an adjective, as found in the DRS. % @param Comparision is one of {pos_as, comp_than} % @param ComparisionTarget is one of {subj, obj} % @param Ref2Text is the surface text of the 2nd NP argument of the property/6 % @param Ref3Text is the surface text of the 3rd NP argument of the property/6 % @param SurfaceText is a combination of the adjective, comparision type, % comparison target, and the surface texts of the arguments % surface_property(Adjective, pos_as, subj, Ref2Text, Ref3Text, [as, Adjective, Ref2Text, as, Ref3Text]). surface_property(Adjective, pos_as, obj, Ref2Text, Ref3Text, [as, Adjective, Ref2Text, as, Preposition, Ref3Text]) :- pos_comp(Adjective, _, Preposition). surface_property(Adjective, comp_than, subj, Ref2Text, Ref3Text, [MoreAdjective, Ref2Text, than, Ref3Text]) :- pos_comp(Adjective, MoreAdjective, _). surface_property(Adjective, comp_than, obj, Ref2Text, Ref3Text, [MoreAdjective, Ref2Text, than, Preposition, Ref3Text]) :- pos_comp(Adjective, MoreAdjective, Preposition). %% pos_comp(+Adjective:atom, -ComparativeAdjective:list, -Preposition:list) is det. % % Note that as with plural nouns, Clex does not tell us whether the surface form of an adjective (e.g. 'far') % is to be preferred over another surface form with the same lemma. % E.g. 'far' can be mapped to either 'further' or 'farther'. % % @param Adjective is an ACE adjective (e.g. 'good') % @param ComparativeAdjective is an ACE comparative adjective (e.g. 'better', 'more tall', 'fonder-of') % @param Preposition is the preposition of a transitive adjective (e.g. 'of' in case of 'fonder-of') % pos_comp(Positive, [Comparative], []) :- lexicon_interface:adj_itr_comp(Comparative, Positive), !. pos_comp(Positive, [Comparative], [Preposition]) :- lexicon_interface:adj_tr_comp(Comparative, Positive, Preposition), !. pos_comp(Positive, [more, Positive], []) :- lexicon_interface:adj_itr(Positive, _), !. pos_comp(Positive, [more, Positive], [Preposition]) :- lexicon_interface:adj_tr(Positive, _, Preposition), !. % If everything else fails, then we add a prefix. pos_comp(Positive, [more, a, ':', Positive], []). %% pos_sup(+Adjective:atom, -SuperlativeAdjective:list) is det. % % @param Adjective is an ACE adjective (e.g. 'good') % @param SuperlativeAdjective is an ACE superlative adjective (e.g. 'best', 'most tall') % pos_sup(Positive, [Superlative]) :- lexicon_interface:adj_itr_sup(Superlative, Positive), !. pos_sup(Positive, [Superlative]) :- lexicon_interface:adj_tr_sup(Superlative, Positive, _), !. pos_sup(Positive, [most, Positive]) :- lexicon_interface:adj_itr(Positive, _), !. pos_sup(Positive, [most, Positive]) :- lexicon_interface:adj_tr(Positive, _, _), !. % If everything else fails, then we add a prefix. pos_sup(Positive, [most, a, ':', Positive]). %% surface_quotedstring(+DrsString:atomic, -QuotedString:atom) is det. % % Adds quotes around the atom within the DRS string(.), % escaping also double quotes and backslashes. % The result is an ACE quoted string. % % @param DrsString e.g. 'a\\b"c' % @param QuotedString e.g. '"a\\\\b\"c"' % surface_quotedstring(DrsString, QuotedString) :- atom_codes(DrsString, Codes1), quote_and_escape(Codes1, Codes2), atom_codes(QuotedString, Codes2). %% quote_and_escape(+CodesIn:list, -CodesOut:list) is det. % % Escapes quote (34) and backslash (92) characters in the input list of codes, % borders the resulting list with quotes (34). % quote_and_escape(Cs, [34 | QCs]) :- quote_and_escape_(Cs, QCs). quote_and_escape_([], [34]). quote_and_escape_([92 | Cs], [92, 92 | QCs]) :- !, quote_and_escape_(Cs, QCs). quote_and_escape_([34 | Cs], [92, 34 | QCs]) :- !, quote_and_escape_(Cs, QCs). quote_and_escape_([C | Cs], [C | QCs]) :- quote_and_escape_(Cs, QCs). %% surface_noun(+Type:atom, +Lemma:atom, ?Num:atom, -Form:term) % % Examples: % % * surface_noun(cn, man, sg, man) % * surface_noun(cn, somebody, sg, somebody) % * surface_noun(cn, man, pl, men) % * surface_noun(cn, sand, mass, sand) % % Note: we do not add a prefix to unknown proper names unless they start with % a lowercase character. % % Note that in the presence of aliases, the mapping of lemmas to surface forms % is not deterministic. Consider e.g. the mapping of surface forms to lemmas: % % * abaci -> abacus % * abacuses -> abacus % % As Clex does not include information about the main word vs its alias, we % return whatever comes first. % % @param Type is one of {cn, pn}, i.e. common noun or proper name % @param Lemma is the lemma of the noun as found in the DRS % @param Num is one of {sg, mass, pl} % @param Form is the surface form of the noun (possibly a list of tokens) % % @bug: possibly slow, because uses an unindexed argument % surface_noun(cn, Lemma, sg, Form) :- lexicon_interface:noun_sg(Form, Lemma, _), !. surface_noun(cn, Lemma, mass, Form) :- lexicon_interface:noun_mass(Form, Lemma, _), !. surface_noun(cn, Lemma, pl, Form) :- lexicon_interface:noun_pl(Form, Lemma, _), !. % If the common noun lemma does not exist in the lexicon % then output it with the n-prefix, regardless of % its number. surface_noun(cn, Lemma, _Num, [n, ':', Lemma]). surface_noun(pn, Lemma, sg, Form) :- lexicon_interface:pn_sg(Form, Lemma, _), !. surface_noun(pn, Lemma, sg, [the, Form]) :- lexicon_interface:pndef_sg(Form, Lemma, _), !. surface_noun(pn, Lemma, pl, Form) :- lexicon_interface:pn_pl(Form, Lemma, _), !. surface_noun(pn, Lemma, pl, [the, Form]) :- lexicon_interface:pndef_pl(Form, Lemma, _), !. surface_noun(pn, Lemma, sg, [p, ':', Lemma]) :- \+ is_capitalized(Lemma), !. surface_noun(pn, Lemma, pl, [p, ':', Lemma]) :- \+ is_capitalized(Lemma), !. surface_noun(pn, Lemma, sg, Lemma). surface_noun(pn, Lemma, pl, Lemma). %% verb_pl(+Lemma:atom, -Pl:atom) is det. % % @param Lemma is the lemma of the verb as found in the DRS % @param Pl is the plural form of the verb % verb_pl(be, are) :- !. % BUG: is this needed? verb_pl(Lemma, Pl) :- lexicon_interface:iv_infpl(Pl, Lemma), !. verb_pl(Lemma, Pl) :- lexicon_interface:tv_infpl(Pl, Lemma), !. verb_pl(Lemma, Pl) :- lexicon_interface:dv_infpl(Pl, Lemma, _), !. verb_pl(Lemma, [v, ':', Lemma]). %% verb_sg(+Lemma:atom, -Sg:atom) is det. % % @param Lemma is the lemma of the verb as found in the DRS % @param Sg is the singular form of the verb % verb_sg(be, is) :- !. % BUG: is this needed? verb_sg(Lemma, Sg) :- lexicon_interface:iv_finsg(Sg, Lemma), !. verb_sg(Lemma, Sg) :- lexicon_interface:tv_finsg(Sg, Lemma), !. verb_sg(Lemma, Sg) :- lexicon_interface:dv_finsg(Sg, Lemma, _), !. verb_sg(Lemma, [v, ':', Lemma]). %% get_di_marker(+SgPl:atom, +Lemma:atom, -SurfaceForm:term, -DiMarker:atom) % % Looks up the prepositional marker of a ditransitive verb % by its lemma and its number. % (Note that the ACE lexicon allows the % same lemma to have a different marker in % singular and in plural context. This is not % the case in English though.) % get_di_marker(sg, Lemma, Form, DiMarker) :- lexicon_interface:dv_finsg(Form, Lemma, DiMarker), DiMarker \= '', !. get_di_marker(sg, Lemma, Form, '') :- lexicon_interface:dv_finsg(Form, Lemma, ''), !. get_di_marker(pl, Lemma, Form, DiMarker) :- lexicon_interface:dv_infpl(Form, Lemma, DiMarker), DiMarker \= '', !. get_di_marker(pl, Lemma, Form, '') :- lexicon_interface:dv_infpl(Form, Lemma, ''), !. get_di_marker(_, Lemma, [v, ':', Lemma], ''). %% acesentencelist_pp(+AceList:list, -AceText:atom) is det. % % Expects the input list to be a list of atoms (ACE sentences) or a list % of lists of atoms (lists of ACE sentences). % Produces an ACE text (atom). % % @param AceList is a list of (lists of) ACE sentences (atoms) % @param AceText is an ACE text (atom) % acesentencelist_pp([], ''). acesentencelist_pp([X | Xs], Ace) :- is_list(X), % peek, if is paragraph !, maplist(acesentencelist_pp_x, [X | Xs], AceList), concat_atom(AceList, '\n\n', Ace). acesentencelist_pp([X | Xs], Ace) :- atom(X), % peek, if is sentence !, acesentencelist_pp_x([X | Xs], Ace). acesentencelist_pp_x([], '# ...') :- !. acesentencelist_pp_x(AceSentenceList, AceText) :- concat_atom(AceSentenceList, '\n', AceText). %% listlist_listatom(+ListOfList:list, -ListOfAtom:list) is det. % % @param ListOfList is a list (ACE sentences) of lists (ACE tokens) % @param ListOfAtom is a list of atoms (ACE sentences) % % @deprecated try to use ace_niceace:tokens_to_sentences/2 instead % % used only by Core ACE, NP ACE verbalizers, clean it up % listlist_listatom([], []). listlist_listatom([TokenList | Tail], [Atom | RestT]) :- ( TokenList = [] -> Atom = 'ERROR' ; sentence_type(TokenList, NewTokenList), ace_niceace(NewTokenList, [FirstToken | RestTokenList]), atom_capitalize(FirstToken, FirstTokenCapitalized), concat_atom([FirstTokenCapitalized | RestTokenList], ' ', Atom) ), listlist_listatom(Tail, RestT). %% sentence_type(+TokenList:list, -NewTokenList:list) is det. %% sentence_type(+TokenList:list, -Type:atom, -NewTokenList:list) is det. % % Assigns a type to the given sentence. Type is one of '.' and '?'. % % The qp/1 term contains a list of tokens that represent the % query word (e.g. [how, many]). % sentence_type(TokenList, NewTokenList) :- sentence_type(TokenList, _, NewTokenList). sentence_type([], '.', ['.']) :- !. sentence_type([], '?', ['?']). sentence_type([qp(QPhrase) | T], _, Tokens) :- !, sentence_type(T, '?', T2), append(QPhrase, T2, Tokens). sentence_type([H | T], Type, [H | T2]) :- sentence_type(T, Type, T2). %% clear_vars is det. % % Retracts dynamic predicates: var/2, var_once/1, var_twice/1. % clear_vars :- retractall(var(_, _)), retractall(var_once(_)), retractall(var_twice(_)). %% add_var(+Var:nvar) is det. % % Asserts the (numbervared) variable so that we know if it has been % used once or more times. % % @param Var is a numbervared discourse referent % add_var(Var) :- ( var_twice(Var) -> true ; ( var_once(Var) -> retract(var_once(Var)), assert(var_twice(Var)) ; assert(var_once(Var)) ) ). %% remove_singletons(+ListIn:list, -ListOut:list) is det. %% remove_singletons_x(+VarCount:integer, +ListIn:list, -ListOut:list) is det. % % remove_singletons(In, Out) :- remove_singletons_x(1, _, In, Out). remove_singletons_x(Count, Count, [], []). remove_singletons_x(CountIn, CountOut, [List | T], [RList | RT]) :- remove_singletons_x(CountIn, CountTmp, List, RList), remove_singletons_x(CountTmp, CountOut, T, RT). remove_singletons_x(CountIn, CountOut, ['$VAR'(N) | T], RT) :- var_once('$VAR'(N)), !, remove_singletons_x(CountIn, CountOut, T, RT). remove_singletons_x(CountIn, CountOut, ['$VAR'(N) | T], [NiceRef | RT]) :- var(N, VarNumber), !, format(atom(NiceRef), 'X~w', [VarNumber]), remove_singletons_x(CountIn, CountOut, T, RT). remove_singletons_x(CountIn, CountOut, ['$VAR'(N) | T], [NiceRef | RT]) :- !, assert(var(N, CountIn)), format(atom(NiceRef), 'X~w', [CountIn]), NewCount is CountIn + 1, remove_singletons_x(NewCount, CountOut, T, RT). remove_singletons_x(CountIn, CountOut, [H | T], [H | RT]) :- remove_singletons_x(CountIn, CountOut, T, RT).