% the new tokkie.pl, by Johan Bos

/* ========================================================================
   File Search Paths
======================================================================== */

% Path aliases for loading project libraries. The semlib alias is the one
% used by the use_module/2 directives in this file (semlib(abbreviations),
% semlib(errors), semlib(options)); the boxer alias is not referenced
% anywhere in the visible source.
% NOTE(review): both directories are relative paths, so presumably the
% program has to be started from the project root -- confirm.
file_search_path(semlib, 'src/prolog/lib').
file_search_path(boxer,  'src/prolog/boxer').


/* ========================================================================
   Reader Flags and Dynamic Predicates
======================================================================== */

% The rules in this file write character data as double-quoted text
% ("can", "n't", "e.g.", ...) and then process it with list predicates
% (append/3, member/2, length/2) against lists of character codes.
% That only works when the reader turns double-quoted text into code
% lists.  SWI-Prolog 7 and later read it as string objects by default, so
% the flag is set explicitly; on older versions this is the default anyway.
:- set_prolog_flag(double_quotes, codes).

% split/7 is filled in by initSplitRules/0 and title/1 by initTitles/0.
:- dynamic split/7, title/1.


/* ========================================================================
   Load other libraries
======================================================================== */

:- use_module(library(lists),[member/2,append/3,reverse/2]).
:- use_module(library(readutil),[read_stream_to_codes/2]).
:- use_module(semlib(abbreviations),[iAbb/2,tAbb/2]).
:- use_module(semlib(errors),[error/2,warning/2]).
:- use_module(semlib(options),[option/2,parseOptions/2,setOption/3,
                               showOptions/1,setDefaultOptions/1]).


/* ========================================================================
   Main
======================================================================== */

% tokkie/0: top-level driver. The clauses are tried in order: an explicit
% help request, the normal tokenisation run, and a fallback that prints the
% usage text when the normal run could not open its streams.

% Help was requested: some option with value `do' is '--help'.
tokkie:-
   option(Flag,do),
   Flag = '--help', !,
   help.

% Normal run: read the whole input as character codes, initialise the rule
% tables, tokenise sentence by sentence (readLines/5 also prints the tokens
% via outputTokens/3), then emit IOB output if that mode is selected.
tokkie:-
   openInput(Source),
   openOutput(Sink), !,
   read_stream_to_codes(Source,Text),
   close(Source),
   initTokkie,
   readLines(Text,0,1,Sink,Sentences),
   outputIOB(Text,Sentences,Sink),
   close(Sink).

% No input could be opened: switch the help option on and show the usage.
tokkie:-
   setOption(tokkie,'--help',do), !,
   help.


/* ----------------------------------------------------------------------
   Read lines
---------------------------------------------------------------------- */

% readLines(+Codes,+Pos,+SentNo,+Stream,-Sentences)
%
% Cuts Codes (whose first code sits at character offset Pos) into
% sentences.  Each sentence is tokenised, handed to outputTokens/3 together
% with its sentence number SentNo, and its token list becomes one element
% of Sentences.
readLines(Text,Pos,SentNo,Out,[SentTokens|More]):-
   begSent(Text,Pos,FromStart,Start), !,           % skip leading separators
   endSent(FromStart,Start,Sentence,End,Remainder,[]),
   tokenise(Sentence,Start,Start,Queue-Queue,SentTokens),
   outputTokens(SentTokens,SentNo,Out),
   NextNo is SentNo + 1,                           % number of next sentence
   readLines(Remainder,End,NextNo,Out,More).       % rest of the document

% No further sentence start could be found: we are done.
readLines(_,_,_,_,[]).


/* ----------------------------------------------------------------------
   Determine beginning of sentence
---------------------------------------------------------------------- */

% begSent(+Codes,+Pos,-Sentence,-Start)
%
% Drops leading separator codes (see sep/1).  Sentence is the remaining
% input, Start its character offset.  Fails when nothing but separators
% (or nothing at all) is left.
begSent([Code|Codes],Pos,Sentence,Start):-
   sep(Code), !,
   Next is Pos + 1,
   begSent(Codes,Next,Sentence,Start).

% First code is not a separator: the sentence starts right here.
begSent(Codes,Pos,Codes,Pos):-
   Codes = [_|_].


/* ----------------------------------------------------------------------
   Determine end of sentence

   endSent(+CodesI,             % Input string
           +CurrentPosition,    % Character position of first input code
           -CodesO,             % Output string (up to and including the
                                %   sentence boundary)
           -BoundaryPosition,   % Character position just after boundary
           -CodesR,             % Rest string (input after the boundary)
           +CodesLast)          % Codes of the token currently being read,
                                %   most recent code first; [] if none

   The clauses are tried in order and each commits with a cut, so their
   order is significant.
---------------------------------------------------------------------- */

% End of input: the sentence ends here, nothing is left.
endSent([],I,[],I,[],_):- !.

% Case 1: A full stop with no token in front of it (start of the sentence
%         or directly after a non-alphanumeric character such as a space)
%         --> sentence boundary.
endSent([46|Rest],I1,[46],I2,Rest,[]):- !, 
   I2 is I1 + 1.

% Case 2: full stop before one or two quotes followed by a
%         non-alphanumeric character (e.g. a space)
%         --> sentence boundary, the quotes belong to this sentence
endSent([46,Q1,Q2,X|Rest],I1,[46,Q1,Q2],I2,[X|Rest],_):- 
   \+ alphanum(X), quote(Q1), quote(Q2), !, I2 is I1 + 3.

endSent([46,Q,X|Rest],I1,[46,Q],I2,[X|Rest],_):- 
   \+ alphanum(X), quote(Q), !, I2 is I1 + 2.

% Case 3: full stop, but no sentence boundary (see noSentenceBoundary/3);
%         the full stop becomes part of the current token
endSent([C|C1],I1,[C|C2],I3,Rest,Last):- 
   noSentenceBoundary([C],C1,Last), !,
   I2 is I1 + 1,
   endSent(C1,I2,C2,I3,Rest,[C|Last]).

% Case 4: A full stop/question/exclamation mark after a non-abbreviation 
%         --> sentence boundary
%         (A former extra clause for the full stop was removed: it could
%         never be reached because this clause already commits on 46.)
endSent([End|Rest],I1,[End],I2,Rest,_):- 
   member(End,[46,63,33]), !, 
   I2 is I1 + 1.

% Alphanumeric character: extend the current token.
endSent([C|C1],I1,[C|C2],I3,Rest,Last):-
   alphanum(C), !,
   I2 is I1 + 1,
   endSent(C1,I2,C2,I3,Rest,[C|Last]).

% Any other character: copy it and start with an empty current token.
endSent([C|C1],I1,[C|C2],I3,Rest,_):-
   I2 is I1 + 1,
   endSent(C1,I2,C2,I3,Rest,[]).


/* ----------------------------------------------------------------------
   Cases describing NO sentence boundaries

   noSentenceBoundary(Char,     % Character that could signal boundary
                      Next,     % Codes following
                      Last)     % Last token

---------------------------------------------------------------------- */
% The candidate character is passed as a one-element code list; it is
% matched against [46] (full stop) written as an explicit code list so
% that the clauses do not depend on how double-quoted text is read.
% Last holds the codes of the token before the full stop, most recent
% code first.

% Case 1: full stop after uppercase one-character token (i.e. initial)
noSentenceBoundary([46],_,Last):- Last = [Upper], upper(Upper).
% Case 2: full stop after a title (title/1 stores titles reversed)
noSentenceBoundary([46],_,Last):- title(Last).
% Case 3: full stop after an abbrev (token already contains a full stop)
noSentenceBoundary([46],_,Last):- member(46,Last).
% Case 4: full stop before number
noSentenceBoundary([46],[N|_],_):- num(N).


/* ----------------------------------------------------------------------
   Split Line into Tokens
---------------------------------------------------------------------- */

% tokenise(+Codes,+CurrentPos,+StartPos,+Sofar-Tail,-Tokens)
%
%   Codes       remaining codes of the sentence
%   CurrentPos  character position of the first code in Codes
%   StartPos    character position where the pending token started
%   Sofar-Tail  the pending token as a difference list (Tail is its open
%               end); an empty queue is written T-T
%   Tokens      resulting list of tok(Start,End,Codes) terms
%
% The clauses are tried in order and commit with a cut once they apply,
% so their order is significant.

% Nothing left to do, no tokens in queue
%
tokenise([],_,_,Sofar-[],[]):- Sofar=[], !.

% Nothing left to do, still a token present (input empty): store last token 
% (unifying the tail with [] closes the pending token)
%
tokenise([],CurrentPos,StartPos,Sofar-[],[tok(StartPos,CurrentPos,Sofar)]):- !.

% Separator follows separator (queue is empty): skip it and restart the
% token one position further
%
tokenise([Sep|Codes],CurrentPos,_,T1-T2,Tokens):-
   sep(Sep), T2=[], T1=[], !,
   Pos is CurrentPos + 1, 
   tokenise(Codes,Pos,Pos,T-T,Tokens).

% Separator follows token: close the pending token and emit it; the
% separator itself is not part of any token
%
tokenise([Sep|Codes],CurrentPos,StartPos,Sofar-Tail,[Token|Tokens]):-
   sep(Sep), !, Tail = [],
   Token = tok(StartPos,CurrentPos,Sofar), 
   Pos is CurrentPos + 1, 
   tokenise(Codes,Pos,Pos,T-T,Tokens).

% Last character is a split, nothing in the queue: store last character
% (final/4 matches the whole remaining input)
%
tokenise(Input,CurrentPos,_,Sofar-[],[Token|Tokens]):- 
   final(Input,Head,Rest,Len), Sofar = [], !,
   FinalPos is CurrentPos + Len,
   Token = tok(CurrentPos,FinalPos,Head),
   tokenise(Rest,FinalPos,FinalPos,T-T,Tokens).

% Last character is a split, store item in the queue and last character
% (two tokens are emitted: the pending one and the final one)
%
tokenise(Input,CurrentPos,StartPos,Sofar-[],[Token1,Token2|Tokens]):- 
   final(Input,Head,Rest,Len), !,
   FinalPos is CurrentPos + Len,
   Token1 = tok(StartPos,CurrentPos,Sofar),
   Token2 = tok(CurrentPos,FinalPos,Head),
   tokenise(Rest,FinalPos,FinalPos,T-T,Tokens).

% Do not perform a split: dontsplit/5 moves a protected sequence of Diff
% codes from the input into the pending token
%
tokenise(Input,CurrentPos,StartPos,OldSofar,Tokens):-
   dontsplit(Input,Rest,Diff,OldSofar,NewSofar), !,
   Pos is CurrentPos + Diff, 
   tokenise(Rest,Pos,StartPos,NewSofar,Tokens).


% Perform a token split operation: Left is appended to the pending token,
% which is then emitted; Right becomes the start of the next token
%
tokenise(Input,CurrentPos,StartPos,Sofar-Tail,[Token|Tokens]):-
   trysplit(Input,Left,Right,Rest,LenLeft,LenRight), !,
%  format('Input: ~s~n',[Input]),
%  format('Left: ~s~n',[Left]),
%  format('Right: ~s~n',[Right]),
%  format('Rest: ~s~n',[Rest]),
   Pos is CurrentPos + LenLeft,
   NewPos is Pos + LenRight,
   Tail = Left,
   Token = tok(StartPos,Pos,Sofar),    
   append(Right,NewTail,New),
   tokenise(Rest,NewPos,Pos,New-NewTail,Tokens).

% Do nothing but collect new token: append the code to the open end of
% the pending token
%
tokenise([X|Codes],CurrentPos,StartPos,Sofar-Tail,Tokens):-
   Pos is CurrentPos + 1, 
   Tail = [X|NewTail],
   tokenise(Codes,Pos,StartPos,Sofar-NewTail,Tokens).


/* ----------------------------------------------------------------------
   Output Tokens
---------------------------------------------------------------------- */

% outputTokens(+Tokens,+SentNo,+Stream)
%
% Prints the tokens of one sentence when the output mode is `poor' or
% `rich' (token numbering starts at 1); in any other mode nothing is
% printed here.
outputTokens(Tokens,SentNo,Out):-
   (  option('--mode',poor)
   ;  option('--mode',rich)
   ), !,
   printTokens(Tokens,SentNo,1,Out).

outputTokens(_,_,_).


/* ----------------------------------------------------------------------
   Wrapper IOB format
---------------------------------------------------------------------- */

% outputIOB(+Codes,+Sentences,+Stream)
%
% In `iob' mode, prints one line per input code (positions counted from
% 0) via printIOB/4; in every other mode it succeeds without output.
outputIOB(Text,Sentences,Out):-
   (  option('--mode',iob)
   -> printIOB(Text,0,Sentences,Out)
   ;  true
   ).


/* ----------------------------------------------------------------------
   Output IOB format
---------------------------------------------------------------------- */

% printIOB(+Codes,+Pos,+Sentences,+Stream)
%
% Walks over the input codes, Pos being the position of the first one,
% and prints each code with its tag through tupleIOB/5.  Sentences is the
% list of token lists produced by readLines/5.
printIOB([],_,_,_).

printIOB([Code|Codes],Pos,Sentences,Out):-
   iobTag(Pos,Sentences,Tag,Tok),
   tupleIOB(Pos,Code,Tag,Tok,Out),
   Next is Pos + 1,
   printIOB(Codes,Next,Sentences,Out).

% iobTag(+Pos,+Sentences,-Tag,-Tok)
%
% Tag for the character at Pos; Tok is the token starting there, or []
% when no token starts at Pos.
%   'S'  first character of the first token of a sentence
%   'T'  first character of any other token
%   'I'  strictly inside a token
%   'O'  outside any token
iobTag(Pos,Sentences,'S',Tok):-
   member([tok(Pos,_,Tok)|_],Sentences), !.

iobTag(Pos,Sentences,'T',Tok):-
   member(Sentence,Sentences),
   member(tok(Pos,_,Tok),Sentence), !.

iobTag(Pos,Sentences,'I',[]):-
   member(Sentence,Sentences),
   member(tok(From,To,_),Sentence), Pos > From, Pos < To, !.

iobTag(_,_,'O',[]).


/* ----------------------------------------------------------------------
   Tuple IOB format
---------------------------------------------------------------------- */

% tupleIOB(+Pos,+Code,+Tag,+Tok,+Stream)
%
% Writes one IOB line for a single character code.  There is no clause
% for formats other than `txt' and `prolog', so the call fails for them.

% Plain text: character code and tag.
tupleIOB(_,Code,Label,_,Out):- 
   option('--format',txt), !,
   format(Out,'~p ~p~n',[Code,Label]).

% Prolog fact, followed by a comment holding the position and the token
% that starts here (empty if none).
tupleIOB(Pos,Code,Label,Codes,Out):- 
   option('--format',prolog), !,
   format(Out,'tok(~p,\'~p\'). % ~p ~s~n',[Code,Label,Pos,Codes]).


/* ----------------------------------------------------------------------
   Print Tokens
---------------------------------------------------------------------- */

% printTokens(+Tokens,+SentNo,+TokNo,+Stream)
%
% Prints the tokens of one sentence.  In `rich' mode every token gets its
% own line with begin/end positions and the index SentNo*1000+TokNo; in
% `poor' mode the tokens are printed on one line separated by spaces.
printTokens([],_,_,_). 

% Poor mode, last token of the sentence: finish the line.
printTokens([tok(_,_,Codes)],_,_,Out):- 
   option('--mode',poor), !,
   format(Out,'~s~n',[Codes]). 

% Rich mode, Prolog facts.
printTokens([tok(From,To,Codes)|Others],SentNo,TokNo,Out):- 
   option('--format',prolog),
   option('--mode',rich), !,
   Id is SentNo*1000+TokNo,
   format(Out,'tok(~p, ~p, ~p, ~s).~n',[From,To,Id,Codes]), 
   NextNo is TokNo+1,
   printTokens(Others,SentNo,NextNo,Out).

% Rich mode, plain text columns.
printTokens([tok(From,To,Codes)|Others],SentNo,TokNo,Out):- 
   option('--format',txt),
   option('--mode',rich), !,
   Id is SentNo*1000+TokNo,
   format(Out,'~p ~p ~p ~s~n',[From,To,Id,Codes]), 
   NextNo is TokNo+1,
   printTokens(Others,SentNo,NextNo,Out).

% Poor mode, more tokens follow: token plus a separating space.
printTokens([tok(_,_,Codes)|Others],SentNo,TokNo,Out):- 
   option('--mode',poor), !,
   format(Out,'~s ',[Codes]), 
   printTokens(Others,SentNo,TokNo,Out).


/* ----------------------------------------------------------------------
   Type checking
---------------------------------------------------------------------- */

% sep(?Code): character codes treated as token separators.  They are
% skipped at the start of a sentence (begSent/4) and end the pending token
% during tokenisation (tokenise/5).
sep(10).    % new line
sep(13).    % new line
sep(32).    % space
sep(9).     % tab
sep(160).   % nbsp (non-breaking space)
sep(8194).  % en space
sep(8195).  % em space

% alphanum(?Code): Code is alphabetic (see alpha/1) or a digit (see num/1).
% Commits to the first solution.
alphanum(Code):-
   (  alpha(Code)
   ;  num(Code)
   ), !.

% alpha(?Code): Code is '>' (code 62, counted as alphabetic because it
% ends markup), an uppercase letter or a lowercase letter.  Commits to the
% first solution.
alpha(Code):-
   (  Code = 62                        %%% '>' (end of markup)
   ;  upper(Code)
   ;  lower(Code)
   ), !.

% upper(?Code): Code is the character code of an ASCII uppercase letter
% (65..90).  With Code unbound the letters are enumerated in ascending
% order; between/3 is used for that so the result does not depend on how
% double-quoted text is read.
upper(X):- number(X), X > 64, X < 91, !.
upper(X):- var(X), between(65,90,X).

% lower(?Code): Code is the character code of an ASCII lowercase letter
% (97..122).  With Code unbound the letters are enumerated in ascending
% order; between/3 is used for that so the result does not depend on how
% double-quoted text is read.
lower(X):- number(X), X > 96, X < 123, !.
lower(X):- var(X), between(97,122,X).

% num(?Code): Code is the character code of a decimal digit (48..57).
% With Code unbound the digits are enumerated in ascending order;
% between/3 is used for that so the result does not depend on how
% double-quoted text is read.
num(X):- number(X), X > 47, X < 58, !.
num(X):- var(X), between(48,57,X).


/* ----------------------------------------------------------------------
   Rules for splitting tokens
   split(+Left,+ConditionsOnLeft,+Right,+ConditionsOnRight,+Context)
---------------------------------------------------------------------- */

% Each rule applies when the input starts with Left, immediately followed
% by Right, immediately followed by Context, and the goals in both
% condition lists succeed (see trysplit/6).  The pending token then ends
% after Left and Right starts the next token; Context is only looked at,
% not consumed.  initSplitRules/0 copies these rules in the order listed
% into split/7, and trysplit/6 commits to the first rule that applies, so
% the order of the facts is significant.
% NOTE(review): Left, Right and Context are handled with length/2 and
% append/3 against code lists, so the double-quoted literals must be read
% as code lists (double_quotes flag set to codes).

% Contractions and clitics.
split("can",[], "not",[], []).
split([_],[], "n't",[], []).
split([_],[], "'ll",[], []).
split([_],[], "'ve",[], []).
split([_],[], "'re",[], []).

split([_],[], "'m",[], []).
split([_],[], "'d",[], []).
split([_],[], "'s",[], []).

% Percent signs, commas and other punctuation.
split([N],[num(N)],   [], [], "%").
split("%",[],         ",",[],[]).
split(")",[],         ",",[],[]).

split([N],[num(N)],   ",",[], [32]).
split([N],[num(N)],   ",",[], [10]).
split([A],[alpha(A)], [], [], ",").
split([_],[],         ";",[], []).
split([_],[],         ":",[], []).
split([_],[],         [],[], ")").
%split([_],[],         ")",[], []).
split([_],[],         "]",[], []).

% Currency symbols and opening brackets are cut off from what follows.
split("$",[],   [N],[num(N)], []).     % dollar
split([163],[], [N],[num(N)], []).     % pound
split([165],[], [N],[num(N)], []).     % yen
split("(",[],   [X],[alphanum(X)], []).
split("[",[],   [X],[alphanum(X)], []).

% Quotes: split before any quote, and between a quote and a following
% alphanumeric character.
split([_],[],         [Q],[quote(Q)], []).
split([Q],[quote(Q)], [X],[alphanum(X)], []).


/* ----------------------------------------------------------------------
   Exceptions (do not split)
---------------------------------------------------------------------- */

% dontsplit(+Input,-Rest,-Len,+Token-Hole,-Token-NewHole)
%
% Succeeds when Input starts with one of the protected sequences listed
% in nosplit/2.  The sequence (Len codes long) is appended to the pending
% token, given as a difference list, and Rest is the input after it.
% Commits to the first matching sequence.
dontsplit(Codes,Remaining,Len,Token-Hole,Token-NewHole):- 
   nosplit(Protected,Len),
   append(Protected,Remaining,Codes), !,
   append(Protected,NewHole,Hole).

% nosplit(?Codes,?Length): code sequences that dontsplit/5 moves into the
% pending token as one unit, so that no split rule is applied inside
% them.  Length is the number of codes in the sequence.
% NOTE(review): the purpose of the "hi'it" entry is not evident from this
% file -- confirm it is intended.
nosplit("hi'it",5).
nosplit("e.g.",4).
% 79 is 'O': an O, a single quote (see rsq/1) and an uppercase letter.
nosplit([79,Q,U],3):- rsq(Q), upper(U).   % Irish names


/* ----------------------------------------------------------------------
   Initialisation
---------------------------------------------------------------------- */

% initTokkie/0: fills the dynamic tables used during tokenisation --
% title/1 (via initTitles/0) and split/7 (via initSplitRules/0).
initTokkie:- initTitles, initSplitRules.

% initTitles/0: stores every title abbreviation of the selected language
% as a title/1 fact.  The codes are stored reversed because endSent/6
% collects the current token with the most recent code first and
% noSentenceBoundary/3 looks that reversed token up directly.
%
% Existing title/1 facts are removed first, so running the initialisation
% again does not pile up duplicates.  forall/2 replaces the former
% findall/3 call, which built a result list only to discard it.
% Fails if the '--language' option is not set.
initTitles:-
   option('--language',Language), !,
   retractall(title(_)),
   forall(( tAbb(Language,Title),
            reverse(Title,Reversed) ),
          assertz(title(Reversed))).
          
% initSplitRules/0: compiles the split/5 rules into split/7 facts that
% additionally carry the lengths of the left and right parts, so that
% tokenise/5 can update character positions without measuring the lists
% each time.  The order of the split/5 rules is kept.
%
% Existing split/7 facts are removed first, so running the initialisation
% again does not pile up duplicates.  forall/2 replaces the former
% findall/3 call, which built a result list only to discard it.
initSplitRules:-
   retractall(split(_,_,_,_,_,_,_)),
   forall(split(Le,CondLe,Ri,CondRi,Context),
          ( length(Le,LenLe),
            length(Ri,LenRi),
            assertz(split(Le,LenLe,CondLe,Ri,LenRi,CondRi,Context)) )).


/* ----------------------------------------------------------------------
   Rules for final tokens
---------------------------------------------------------------------- */

% final(+Input,-Token,-Rest,-Len)
%
% Matches when Input -- the complete remaining input of a sentence -- is
% sentence-final punctuation.  Token is the punctuation token to emit,
% Rest what is left to tokenise afterwards and Len the length of Token.
%
% endSent/6 ends a sentence on a full stop, a question mark or an
% exclamation mark, so all three are split off here.  The codes are
% written as explicit lists so that the rules do not depend on how
% double-quoted text is read.
final([63], [63], [], 1).     % ?
final([33], [33], [], 1).     % !
final([46], [46], [], 1).     % .

% Full stop followed by a closing quote: emit the full stop and leave the
% quote for the next round.
final([46,Q],[46], [Q],1):- quote(Q).


/* ----------------------------------------------------------------------
   Try a splitting rule on the input
---------------------------------------------------------------------- */

% trysplit(+Input,-Left,-Right,-Rest,-LenLeft,-LenRight)
%
% Finds the first split/7 rule that applies at the start of Input: Input
% must begin with Left, followed by Right, followed by the rule's right
% context, and the conditions on Left and Right must hold.  Rest is the
% input after Right (it still starts with the context).
trysplit(Codes,Before,After,Remaining,LenBefore,LenAfter):-
   split(Before,LenBefore,BeforeConds,After,LenAfter,AfterConds,Lookahead),
   append(Before,Tail,Codes), 
   checkConds(BeforeConds),  
   append(After,Remaining,Tail), 
   checkConds(AfterConds),   
   append(Lookahead,_,Remaining), !.


/* ----------------------------------------------------------------------
   Check Conditions
---------------------------------------------------------------------- */

% checkConds(+Goals): succeeds when every goal in the list can be proven,
% trying them from left to right.  Only the first solution of each goal
% is used; its variable bindings are kept.
checkConds([]).
checkConds([Goal|Goals]):-
   once(Goal),
   checkConds(Goals).


/* ----------------------------------------------------------------------------------
   Codes for right single quotation marks (used in genitives)
---------------------------------------------------------------------------------- */

% rsq(?Code): single quotation marks accepted between the O and the
% uppercase letter of the name rule in nosplit/2.
rsq(39).      %%% ' (apostrophe)
rsq(8217).    %%% right single quotation mark


/* ----------------------------------------------------------------------------------
   Codes for single-character quotes
---------------------------------------------------------------------------------- */

% quote(?Code): Code is a quotation mark character.  Used by endSent/6
% and final/4 for quotes that follow a full stop, and by the split rules
% that cut quotes off from neighbouring characters.
quote(34).    %%% "
quote(39).    %%% '
quote(96).    %%% `
quote(8216).  %%% left single quotation mark
quote(8217).  %%% right single quotation mark
quote(8218).  %%% low single quotation mark
quote(8220).  %%% left double quotation mark
quote(8221).  %%% right double quotation mark
quote(8222).  %%% low double quotation mark


/* ----------------------------------------------------------------------------------
   Codes for double quotes
---------------------------------------------------------------------------------- */

% quotes(?Code): single-quote characters that, written twice in a row,
% presumably stand for a double quote (as in `` and '').
% NOTE(review): this predicate is not called anywhere in the visible
% source -- confirm whether it is still needed.
quotes(96).    %%% ``
quotes(39).    %%% ''
quotes(8216).  %%% left single quotation mark
quotes(8217).  %%% right single quotation mark
quotes(8218).  %%% low single quotation mark


/* =======================================================================
   Open Input File
========================================================================*/

% openInput(-Stream)
%
% Opens the input as a UTF-8 text stream: the file named by the '--input'
% option when '--stdin' is `dont' and the file exists, standard input
% when '--stdin' is `do'.  Fails in all other cases (e.g. the input file
% does not exist).
openInput(Stream):-
   option('--stdin',dont),
   option('--input',File),
   exists_file(File), !,
   open(File,read,Stream,[encoding(utf8)]).

openInput(Stream):-
   option('--stdin',do), 
   set_prolog_flag(encoding,utf8),
   % The encoding flag is only the default for streams opened from now
   % on; user_input is already open, so its encoding has to be changed on
   % the stream itself to read UTF-8 like the file branch does.
   set_stream(user_input,encoding(utf8)),
   warning('reading from standard input',[]),
   prompt(_,''),                       % no prompt while reading
   Stream = user_input.


/* =======================================================================
   Open Output File
========================================================================*/

% openOutput(-Stream)
%
% Stream is a UTF-8 stream on the file named by the '--output' option if
% that option holds an atomic value other than user_output and the file
% is writable.  If the file is not writable an error is reported and
% standard output is used instead.  Without a usable '--output' option
% the result is standard output as well.
openOutput(Stream):-
   option('--output',Target),
   atomic(Target),
   \+ Target=user_output,
   (  access_file(Target,write)
   -> open(Target,write,Stream,[encoding(utf8)])
   ;  error('cannot write to specified file ~p',[Target]),
      Stream=user_output
   ), !.

openOutput(user_output).


/* =======================================================================
   Help
========================================================================*/

% help/0: prints the usage line and the option overview on request, i.e.
% when the '--help' option has the value `do'.  Succeeds silently when it
% has the value `dont' and fails otherwise.
help:-
   (  option('--help',do)
   -> format(user_error,'usage: tokkie [options]~n~n',[]),
      showOptions(tokkie)
   ;  once(option('--help',dont))
   ).


/* =======================================================================
   Definition of start
========================================================================*/

% start/0: program entry point.  Takes the command line arguments (minus
% the program name), sets the default options, parses the arguments and
% runs tokkie/0.  If any of these steps fails an error is reported.  The
% Prolog process is halted in both cases.
start:-
   (  current_prolog_flag(argv,[_Program|Arguments]),
      setDefaultOptions(tokkie), 
      parseOptions(tokkie,Arguments),
      tokkie
   -> true
   ;  error('tokkie failed',[])
   ),
   halt.