% This file is part of the Attempto Parsing Engine (APE). % Copyright 2008-2013, Attempto Group, University of Zurich (see http://attempto.ifi.uzh.ch). % % The Attempto Parsing Engine (APE) is free software: you can redistribute it and/or modify it % under the terms of the GNU Lesser General Public License as published by the Free Software % Foundation, either version 3 of the License, or (at your option) any later version. % % The Attempto Parsing Engine (APE) is distributed in the hope that it will be useful, but WITHOUT % ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR % PURPOSE. See the GNU Lesser General Public License for more details. % % You should have received a copy of the GNU Lesser General Public License along with the Attempto % Parsing Engine (APE). If not, see http://www.gnu.org/licenses/. :- module(tokenizer, [ tokenize/2 ]). :- use_module('../lexicon/chars', [ is_letter/1, is_digit/1 ]). :- use_module('../logger/error_logger', [ add_error_message_once/4, add_warning_message_once/4 ]). :- use_module('../utils/ace_niceace', [ pronoun_split/2 ]). /** APE Tokenizer @author Kaarel Kaljurand @author Tobias Kuhn @version 2010-03-28 Comments: * Strings (between double quotes) are tokenized into [", content, "]. This can be misleading, e.g. in the sentence: `"1" represents 1.' the verb `represents' is reported as the 4th token. One could instead produce the term string(content), this would probably also fix the buggy handling of `Every dot is ".".'. * Saxon Genitives are tokenized as [noun, ', s] and [nouns, '] * Digits cannot start a token that contains other symbols than digits and dots. Consider the following input string and its tokenization. == 123man man123 123man123 man123man [123, man, man123, 123, man123, man123man] == BUG: future work: add character counting to be able to report character offsets together with every token. 
Example:

?- tokenizer:codes_to_tokens("2men;can't like\"#@\"and:everything@.", T), writeq(T).
[2, men, can, not, like, '"#@"', and, :, every, '-thing', '.']
*/


%% tokenize(+ACEText:term, -Tokens:list) is det.
%
% Breaks the ACEText into a list of tokens (atoms or numbers).
%
% The following forms of ACEText are handled, in this order:
%
%   1. the empty list, which yields the empty token list;
%   2. a string object, which is converted to an atom first;
%   3. an atom like 'this is an example';
%   4. a list of character codes like [116,104,105,115,32,105,...];
%   5. a list of one-character atoms like [t,h,i,s,' ',i,...];
%   6. any other proper list, which is returned unchanged.
%
% Anything else trips the assertion in the last clause.
%
% In case something goes wrong during tokenization (e.g. a string
% or comment is not closed) then an error message is asserted.
%
% @param ACEText is the input text (atom, string, code list or character list)
% @param Tokens is a list of tokens, i.e. the tokenization of the input text
%
tokenize([], []) :-
    !.

tokenize(String, Tokens) :-
    string(String),
    string_to_atom(String, Atom),
    !,
    tokenize(Atom, Tokens).

tokenize(Atom, Tokens) :-
    atom(Atom),
    !,
    atom_codes(Atom, Codes),
    codes_to_tokens(Codes, Tokens).

tokenize([C | Cs], Tokens) :-
    integer(C),
    maplist(integer, Cs),
    !,
    codes_to_tokens([C | Cs], Tokens).

% A list of one-character atoms. Every character is mapped to its code with
% char_code/2; name/2 is not suitable here because its first argument has to
% be atomic, not a list.
tokenize([Char | Chars], Tokens) :-
    is_char_code(Char),
    maplist(is_char_code, Chars),
    !,
    maplist(char_code, [Char | Chars], Codes),
    codes_to_tokens(Codes, Tokens).

% Fallback: a list that is neither a code list nor a character list is
% handed back as it is.
tokenize(List, List) :-
    assertion(is_list(List)),
    !.


%% is_char_code(+A) is semidet.
%
% Succeeds if A is an atom of length one. Note that, despite the name,
% this accepts single characters (atoms), not integer character codes.
%
is_char_code(A) :-
    atom(A),
    atom_length(A, 1).


%% codes_to_tokens(+Codes:list, -Tokens:list) is det.
%
% Maps the given list of character codes into a list of tokens.
% Performs two steps:
%
% 1. Merges certain code sequences into single tokens (atoms or numbers).
% 2. Modifies certain token sequences, e.g. [can, ', t] -> [can, not].
%
% @param Codes is a list of character codes
% @param Tokens is a list of tokens, i.e. the tokenization of the input text
%
codes_to_tokens(Codes, Tokens) :-
    get_atomics(Codes, Atomics),
    expand_contracted_forms(Atomics, Tokens).


%% get_atomics(+Codes:list, -Tokens:list) is det.
%
% BUG: We could also preserve the comment in the token list
% to be able to show better error messages.
%
get_atomics([], []).

% Newline (10): the whitespace that follows is skipped. If it contains at
% least one more newline, a paragraph-break token is inserted; if some other
% character shows up first, nothing is inserted.
get_atomics([10 | Codes], Tokens) :-
    get_whitespace_and_comments(Codes, Rest, NewlineCount),
    !,
    add_paragraph_break_symbol(NewlineCount, RestTokens, Tokens),
    get_atomics(Rest, RestTokens).

% Double quote (34) opens a quoted string.
get_atomics([34 | Codes], Tokens) :-
    !,
    get_string(Codes, Content, Rest),
    string_to_token(Content, RestTokens, Tokens),
    get_atomics(Rest, RestTokens).

% Backtick (96) opens a quoted word; its content becomes one atom.
get_atomics([96 | Codes], [Word | RestTokens]) :-
    !,
    get_qword(Codes, Content, Rest),
    atom_codes(Word, Content),
    get_atomics(Rest, RestTokens).

% Hash (35) opens a Perl-style comment; the comment produces no token.
get_atomics([35 | Codes], Tokens) :-
    !,
    get_perl_comment(Codes, _Comment, Rest),
    get_atomics(Rest, Tokens).

% Slash followed by star (47, 42) opens a C-style comment; no token either.
get_atomics([47, 42 | Codes], Tokens) :-
    !,
    get_c_comment(Codes, _Comment, Rest),
    get_atomics(Rest, Tokens).

% Space, tab and carriage return are skipped (newline is handled above).
get_atomics([Code | Codes], Tokens) :-
    is_whitespace(Code),
    !,
    get_atomics(Codes, Tokens).

% A digit starts a positive number.
get_atomics([Digit | Codes], [Number | RestTokens]) :-
    is_digit(Digit),
    !,
    get_number(Codes, MoreCodes, Rest),
    number_codes(Number, [Digit | MoreCodes]),
    get_atomics(Rest, RestTokens).

% A hyphen (45) directly followed by a digit starts a negative number,
% which is emitted as the two tokens '-' and Number.
% Forms like `-.5' are therefore not recognized as numbers.
get_atomics([45, Digit | Codes], ['-', Number | RestTokens]) :-
    is_digit(Digit),
    !,
    get_number(Codes, MoreCodes, Rest),
    number_codes(Number, [Digit | MoreCodes]),
    get_atomics(Rest, RestTokens).

% A word character (see is_word_char/1) starts a word.
get_atomics([Code | Codes], Tokens) :-
    is_word_char(Code),
    !,
    get_word(Codes, WordTail, Rest),
    split_token([Code | WordTail], RestTokens, Tokens),
    get_atomics(Rest, RestTokens).

% A special character (see is_special/2) becomes a one-character token.
get_atomics([Code | Codes], [Token | RestTokens]) :-
    is_special(Code, Token),
    !,
    get_atomics(Codes, RestTokens).

% Every remaining character is dropped and reported as a warning that
% shows the character together with its hexadecimal and decimal code.
get_atomics([Code | Codes], Tokens) :-
    with_output_to(atom(CharInfo), format("~c (0x~16r, ~10r)", [Code, Code, Code])),
    add_warning_message_once(character, '', CharInfo, 'Unknown character(s) ignored.'),
    get_atomics(Codes, Tokens).


%% split_token(+Codes:list, +Ts:list, -FinalTokens:list) is det.
%
% Turns Codes into an atom and puts it in front of Ts. If pronoun_split/2
% succeeds on that atom, its two parts are put in front of Ts instead,
% e.g. 'Everything' is split into 'Every' and '-thing'.
%
split_token(Codes, Ts, FinalTokens) :-
    atom_codes(Word, Codes),
    (   pronoun_split(Word, (FirstPart, SecondPart))
    ->  FinalTokens = [FirstPart, SecondPart | Ts]
    ;   FinalTokens = [Word | Ts]
    ).


%% is_whitespace(?Code)
%
% Whitespace codes other than the newline, which is treated separately.
%
is_whitespace(32).   % space
is_whitespace(9).    % tab
is_whitespace(13).   % carriage return


%% is_word_char(?Code)
%
% Codes of the characters that may occur in an ACE word.
%
is_word_char(45).    % hyphen
is_word_char(95).    % underscore
is_word_char(36).    % dollar sign
is_word_char(176).   % degree sign
is_word_char(Code) :-
    % any letter
    is_letter(Code).


%% is_special(?Code, ?Atom) is det.
%
% Table of the characters that are turned into one-character atoms.
%
% @param Code is character code
% @param Atom is the one-character atom that corresponds to the code

% sentence-final punctuation
is_special(46, '.').
is_special(63, '?').
is_special(33, '!').

% hyphen
is_special(45, '-').

% colon (used by prefixed words)
is_special(58, ':').

% apostrophe (used by the Saxon genitive)
is_special(39, '\'').

% slash (used by forms like him/her)
is_special(47, '/').

% comma (used by comma-and, comma-or)
is_special(44, ',').

% plus sign
is_special(43, '+').

% exponentiation sign
is_special(94, '^').

% star
is_special(42, '*').

% round brackets
is_special(40, '(').
is_special(41, ')').

% square brackets
is_special(91, '[').
is_special(93, ']').

% curly brackets
is_special(123, '{').
is_special(125, '}').

% less-than, equals, greater-than, backslash
is_special(60, '<').
is_special(61, '=').
is_special(62, '>').
is_special(92, '\\').

% ampersand
is_special(38, '&').


%% get_string(+Codes:list, -Prefix:list, -RemainingCodes:list) is det.
%
% Reads the content of a quoted string up to and including the closing
% quotation mark (34); the closing mark is not part of Prefix.
% A backslash (92) escapes the character after it, so that e.g. a
% quotation mark or a backslash can be part of the content.
% Reaching the end of the input first is reported as an error.
%
get_string([], [], []) :-
    add_error_message_once(character, '', 'EOF', 'Every quoted string must end with ".').

get_string([34 | Rest], [], Rest) :-
    !.

get_string([92, Escaped | Codes], [Escaped | Content], Rest) :-
    !,
    get_string(Codes, Content, Rest).

get_string([Code | Codes], [Code | Content], Rest) :-
    get_string(Codes, Content, Rest).


%% get_qword(+Codes:list, -Prefix:list, -RemainingCodes:list) is det.
%
% Reads the content of a quoted word up to and including the closing
% backtick (96); the closing backtick is not part of Prefix.
% A backslash (92) escapes the character after it, so that e.g. a
% backtick or a backslash can be part of the content.
% Reaching the end of the input first is reported as an error.
%
get_qword([], [], []) :-
    add_error_message_once(character, '', 'EOF', 'Every quoted word must end with `.').

get_qword([96 | Rest], [], Rest) :-
    !.

get_qword([92, Escaped | Codes], [Escaped | Content], Rest) :-
    !,
    get_qword(Codes, Content, Rest).

get_qword([Code | Codes], [Code | Content], Rest) :-
    get_qword(Codes, Content, Rest).


%% get_word(+Codes:list, -Prefix:list, -RemainingCodes:list) is det.
%
% Reads the longest prefix of Codes that consists of word characters
% and/or digits. RemainingCodes starts with the first code that is neither.
%
get_word([], [], []).

get_word([Code | Codes], [Code | WordTail], Rest) :-
    (   is_word_char(Code)
    ;   is_digit(Code)
    ),
    !,
    get_word(Codes, WordTail, Rest).

get_word([Code | Codes], [], [Code | Codes]).


%% get_number(+Codes:list, -Prefix:list, -RemainingCodes:list) is det.
%
% Consumes a number which is a sequence of digits containing
% at most one dot (46).
% The dot (if present) must be followed by a digit.
%
get_number(Codes, Prefix, RemainingCodes) :-
    get_number_x(Codes, zero, Prefix, RemainingCodes).


%% get_number_x(+Codes:list, +Dot_Count, -Prefix:list, -RemainingCodes:list) is det.
%
% Worker of get_number/3. Dot_Count is `zero' as long as no dot has been
% consumed and `more_than_zero' afterwards, so that a second dot always
% ends the number.
%
get_number_x([], _, [], []).

% The first dot is consumed only together with the digit that follows it.
get_number_x([46, C | Cs], zero, [46, C | Prefix], Remaining) :-
    is_digit(C),
    !,
    get_number_x(Cs, more_than_zero, Prefix, Remaining).

get_number_x([C | Cs], Dot_Count, [C | Prefix], Remaining) :-
    is_digit(C),
    !,
    get_number_x(Cs, Dot_Count, Prefix, Remaining).

% Any other code ends the number and is left in the remaining input.
get_number_x([C | Cs], _, [], [C | Cs]).


%% get_perl_comment(+Codes:list, -Prefix:list, -RemainingCodes:list) is det.
%
% Consumes the Perl-style comment excluding the final newline, i.e.
% the newline (10) stays at the front of RemainingCodes.
% Reaching the end of the input first is reported as an error.
%
get_perl_comment([], [], []) :-
    add_error_message_once(character, '', 'EOF', 'Every #-comment must end with the newline.').

get_perl_comment([10 | Cs], [], [10 | Cs]) :-
    !.

get_perl_comment([C | Cs], [C | Prefix], Remaining) :-
    get_perl_comment(Cs, Prefix, Remaining).


%% get_c_comment(+Codes:list, -Prefix:list, -RemainingCodes:list) is det.
%
% Consumes the C-style comment including the final star-slash (42, 47),
% which is not part of Prefix.
% Reaching the end of the input first is reported as an error.
%
get_c_comment([], [], []) :-
    add_error_message_once(character, '', 'EOF', 'Every /*-comment must end with */.').

get_c_comment([42, 47 | Cs], [], Cs) :-
    !.

get_c_comment([C | Cs], [C | Prefix], Remaining) :-
    get_c_comment(Cs, Prefix, Remaining).


%% string_to_token(+Prefix:list, +Ts:list, -AllTokens:list) is det.
%
% Puts the token for a quoted string in front of Ts. The token is a
% single atom in which the content of the string is bordered by
% quotation marks.
%
% BUG: I'd rather prefer a representation such as
% string('') or string(content).
%
string_to_token(Prefix, Ts, [T | Ts]) :-
    atom_codes(S, Prefix),
    % atomic_list_concat/2 replaces the deprecated concat_atom/2.
    atomic_list_concat(['"', S, '"'], T).


%% add_paragraph_break_symbol(+Newline_Count, +Ts:list, -AllTokens:list) is det.
%
% Puts the paragraph-break token in front of Ts if Newline_Count is
% `at_least_two'; otherwise AllTokens is just Ts.
%
% The token is an atom made of two newline characters. It is written with
% escape sequences because an unescaped line break inside a quoted atom is
% not valid ISO Prolog syntax.
% NOTE(review): confirm that this atom is the paragraph-break token the
% grammar expects.
%
add_paragraph_break_symbol(at_least_two, Ts, ['\n\n' | Ts]) :-
    !.

add_paragraph_break_symbol(_, Ts, Ts).


%% get_whitespace_and_comments(+Codes:list, -Remaining:list, -Newline_Count) is det.
%
% Consumes whitespace and counts the newlines. It is called after a first
% newline has already been read, so the count starts at `one' and becomes
% `at_least_two' as soon as another newline is consumed.
% Remaining starts with the first code that is not whitespace.
% The handling of comments is currently disabled (see the commented-out
% clauses below).
%
get_whitespace_and_comments(Cs, Remaining, Newline_Count) :-
    get_whitespace_and_comments(Cs, one, Remaining, Newline_Count).

get_whitespace_and_comments([], Newline_Count, [], Newline_Count).

get_whitespace_and_comments([10 | Cs], _, Remaining, Newline_Count) :-
    !,
    get_whitespace_and_comments(Cs, at_least_two, Remaining, Newline_Count).

get_whitespace_and_comments([C | Cs], Newline_Count, Remaining, Final_Newline_Count) :-
    is_whitespace(C),
    !,
    get_whitespace_and_comments(Cs, Newline_Count, Remaining, Final_Newline_Count).

/*
% Perl-style comment starts
get_whitespace_and_comments([35 | Cs], Newline_Count, Final_Remaining, Final_Newline_Count) :-
    !,
    get_perl_comment(Cs, _Prefix, Remaining),
    get_whitespace_and_comments(Remaining, Newline_Count, Final_Remaining, Final_Newline_Count).

% C-style comment starts
get_whitespace_and_comments([47, 42 | Cs], Newline_Count, Final_Remaining, Final_Newline_Count) :-
    !,
    get_c_comment(Cs, _Prefix, Remaining),
    get_whitespace_and_comments(Remaining, Newline_Count, Final_Remaining, Final_Newline_Count).
*/

get_whitespace_and_comments([C | Cs], Newline_Count, [C | Cs], Newline_Count).


%% expand_contracted_forms(+TokenListIn:list, -TokenListOut:list) is det.
%
% Rewrites certain token sequences: `no one' becomes [no, '-one'] (also
% with capitalized `No'), the contracted negations isn't, aren't, doesn't,
% don't, can't and shouldn't become the verb followed by `not', and
% `cannot' becomes [can, not]. All other tokens are copied unchanged.
%
% @bug: `cannot' could be instead handled during (pronoun) splitting
%
expand_contracted_forms([], []).

expand_contracted_forms(['No', one | RestIn], ['No', '-one' | RestOut]) :-
    !,
    expand_contracted_forms(RestIn, RestOut).

expand_contracted_forms([no, one | RestIn], [no, '-one' | RestOut]) :-
    !,
    expand_contracted_forms(RestIn, RestOut).

expand_contracted_forms([isn, '\'', t | RestIn], [is, not | RestOut]) :-
    !,
    expand_contracted_forms(RestIn, RestOut).

expand_contracted_forms([aren, '\'', t | RestIn], [are, not | RestOut]) :-
    !,
    expand_contracted_forms(RestIn, RestOut).

expand_contracted_forms([doesn, '\'', t | RestIn], [does, not | RestOut]) :-
    !,
    expand_contracted_forms(RestIn, RestOut).

expand_contracted_forms([don, '\'', t | RestIn], [do, not | RestOut]) :-
    !,
    expand_contracted_forms(RestIn, RestOut).

expand_contracted_forms([can, '\'', t | RestIn], [can, not | RestOut]) :-
    !,
    expand_contracted_forms(RestIn, RestOut).

expand_contracted_forms([cannot | RestIn], [can, not | RestOut]) :-
    !,
    expand_contracted_forms(RestIn, RestOut).

expand_contracted_forms([shouldn, '\'', t | RestIn], [should, not | RestOut]) :-
    !,
    expand_contracted_forms(RestIn, RestOut).

expand_contracted_forms([Token | TailIn], [Token | TailOut]) :-
    expand_contracted_forms(TailIn, TailOut).