1% This file is part of the Attempto Parsing Engine (APE).
    2% Copyright 2008-2013, Attempto Group, University of Zurich (see http://attempto.ifi.uzh.ch).
    3%
    4% The Attempto Parsing Engine (APE) is free software: you can redistribute it and/or modify it
    5% under the terms of the GNU Lesser General Public License as published by the Free Software
    6% Foundation, either version 3 of the License, or (at your option) any later version.
    7%
    8% The Attempto Parsing Engine (APE) is distributed in the hope that it will be useful, but WITHOUT
    9% ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
   10% PURPOSE. See the GNU Lesser General Public License for more details.
   11%
   12% You should have received a copy of the GNU Lesser General Public License along with the Attempto
   13% Parsing Engine (APE). If not, see http://www.gnu.org/licenses/.
   14
   15
   16:- module(tokens_to_sentences, [
   17		tokens_to_sentences/2,
   18		tokens_to_paragraphs/2
   19	]).   20
   21
   22:- use_module('../lexicon/chars', [
   23		is_sentence_end_symbol/1
   24	]).

APE Sentence splitter

Converts a flat list of tokens into a list of sentences, each of which is a list of tokens. Sentences end with one of the three symbols: '.', '?', and '!'.

For example, the following list of tokens:

[John, likes, Mary, ., Every, man, owns, a, car, .]

is converted into the following list of sentences

[[John, likes, Mary, .], [Every, man, owns, a, car, .]]

author: - Kaarel Kaljurand; - Tobias Kuhn
version: - 2009-05-21
bug: - should we generate error messages here (if token list does not end with ./?/!)
*/

tokens_to_sentences(+Tokens:list, -Sentences:list) is semidet

Succeeds if Tokens is a list of ACE sentences, in this case the sentences are returned. To succeed the token list must either be empty or end with a sentence end symbol. E.g. the following token list would cause a failure:

[a, b, c]

Arguments:

`Tokens`	- is a list of ACE tokens
`Sentences`	- is a list of sentences (where a sentence is a list of ACE tokens)

   68tokens_to_sentences([], []).
   69
   70tokens_to_sentences(Tokens, [[^|Sentence]|Sentences]) :-
   71	first_sentence(Tokens, Sentence, RestTokens),
   72	tokens_to_sentences(RestTokens, Sentences).

first_sentence(+Tokens:list, -Sentence:list, -RestTokens:list) is det

Note that the token list can contain variables (added by the guesser to match prefixed in the parser). Thus we have to make sure that a variable is not "mistaken" for a sentence end symbol.

Arguments:

`Tokens`	- is a list of ACE tokens
`Sentence`	- is an ACE sentence (a list of ACE tokens)
`RestTokens`	- is a list of ACE tokens

   85first_sentence([SentenceEndSymbol | RestTokens], [SentenceEndSymbol], RestTokens) :-
   86	nonvar(SentenceEndSymbol),
   87	is_sentence_end_symbol(SentenceEndSymbol),
   88	!.
   89
   90first_sentence([Token | RestTokens], RestSentence, RestTokens2) :-
   91	Token == '<p>',
   92	!,
   93	first_sentence(RestTokens, RestSentence, RestTokens2).
   94
   95first_sentence([Token | RestTokens], [Token | RestSentence], RestTokens2) :-
   96	first_sentence(RestTokens, RestSentence, RestTokens2).

tokens_to_paragraphs(+Tokens, -Paragraphs)

  101tokens_to_paragraphs([], []).
  102
  103tokens_to_paragraphs(Tokens, [Paragraph|Paragraphs]) :-
  104	first_paragraph(Tokens, Paragraph, RestTokens),
  105	tokens_to_paragraphs(RestTokens, Paragraphs).

first_paragraph(+Tokens, -Paragraph, -RestTokens)

  110first_paragraph([Token | RestTokens], [], RestTokens) :-
  111	Token == '<p>',
  112	!.
  113
  114first_paragraph([Token | RestTokens], [Token | RestParagraph], RestTokens2) :-
  115	first_paragraph(RestTokens, RestParagraph, RestTokens2).
  116
  117first_paragraph([], [], [])