:- set_prolog_flag(stack_limit, 10 000 000 000).

:- use_module(library(csv)).        % csv_read_file/2.
:- use_module(library(process)).    % process_create/3.
:- use_module(library(readutil)).   % read_line_to_codes/2.

% if library(lib) is missing, install via pack_install(lib).
%
:- use_module(library(lib)).

% external code, lib knowns how to deal with these (will install if missing)
:- lib(os_lib).
:- lib(by_unix).
:- lib(debug_call).
:- lib(stoics_lib:prefix_atom/2).

% also sets lib alias to that dir
:- ensure_loaded('../../lib/bio_db_build_aliases').  % /1.

% load necessary data that has already been generated
% :- ensure_loaded(hgnc:bio_db_build_downloads('hgnc/maps/map_hgnc_symb_hgnc')).

% local libs & sources
:- lib(de_semi/3).
:- lib(csv_ids_map/6).
:- lib(link_to_bio_sub/3).
:- lib(bio_db_dnt_times/3).
:- lib(build_dnload_loc/3).
:- lib(bio_db_source_url/3).
:- lib(ncbi_species_grep/3).
:- lib(ens_fa_peptide_gene_rows/2).     % /2, fixme: should be more local
:- lib(url_file_local_date_mirror/3).

std_pig_maps_ncbi_defaults( Defs ) :-
                               Defs = [ db(ncbi),
                                        debug(true),
                                        debug_fetch(true),
                                        debug_url(false),
                                        iactive(true),
                                        ncbi_genes_file('gene2ensembl.gz'),
                                        org(pig)
                               ].

/** std_pig_maps_ncbi(+Opts).

Download latest NCBI gene to ensembl map file and convert it to 
a few standard maps.

Opts
  * db(Db)
    source database
  * debug(Dbg=true)
    informational, progress messages
  * debug_fetch(Fbg=true)
    whether to debug the fetching of the url (via url_file_local_date_mirror/3)
  * debug_url(Ubg=false)
    whether to debug the concatenation of the url (via bio_db_source_url/3)
  * iactive(Iact=true)
    whether the session is interactive, otherwise wget gets --no-verbose
  * ncbi_genes_file(GnsF='')
    the url base for the genes download
  * org(Org=human)
    organism

==
?- std_maps_ncbi([]).
==

@author nicos angelopoulos
@version  0.1 2014/7/23
@version  0.2 2022/12/26, entz-> ncbi, url via wget, csv without R
@version  0.3 2023/9/30,  new style opts and helpers
@tbd  can we implement for pig maps_ncbi_rnuc_symb/3, maps_ncbi_ensp_ensg/0 and maps_ncbi_ncbi_gont/0 from human ?

*/
std_pig_maps_ncbi( Args ) :-
     Self = std_pig_maps_ncbi,
     options_append( Self, Args, Opts ),
     bio_db_build_aliases( Opts ),
     build_dnload_loc( Self, DnDir, Opts ),
     bio_db_source_url( Url, [ncbi_genes_file-url_file,debug_url-debug], Opts ),
     options( debug_fetch(Fbg), Opts ),
     url_file_local_date_mirror( Url, DnDir, [debug(Fbg),dnld_file(GnsF)|Opts] ),
     working_directory( Old, DnDir ),
     MapsD = maps,
     make_directory_path( MapsD ),
     directory_file_path( MapsD, GnsF, ToP ),
     copy_file( GnsF, ToP ),
     bio_db_dnt_times( GnsF, DnDt, _DnEn ),
     working_directory( _ParentD, MapsD ),
     @ gunzip( -k, -f, GnsF ),
     file_name_extension( RemS, gz, GnsF ),
     ncbi_species_grep( RemS, PigG2NF, Opts ),
     debuc( Self, 'Grepped pig gene2ensembl into: ~p', [PigG2NF] ),
     std_pig_maps_ncbi( Self, PigG2NF, Url, DnDt, Opts ),
     delete_file( RemS ),
     %pig?: maps_ncbi_rnuc_symb( Self ),
     % maps_ncbi_unig_ncbi,  % unigene is no longer maintained as of Feb.2019
     working_directory( _, Old ).

std_pig_maps_ncbi( Self, PigF, Url, DnDt, Opts ) :-
     TsvOpts = [match_arity(false),separator(0'\t)],
     csv_read_file( PigF, Mtx, TsvOpts ),
     Mtx = [_Comment|Rows],
     Pig = [row(tax_id,ncbi,ensg,nucl_acc,ensr,prot_acc,ensp)|Rows],
     % GEnsGF = entrez_gene_id_ensg.pl,
     % csv_filter_by_column( New, tax_id, =(9606), HS ),
     % mtx_column_values_select( New, tax_id, 9823, Pig, _, true ),
     debuc( Self, length, hs_len/Pig ),
     Lens = [to_value_1(pos_integer),to_value_2(pfx_by('ENS')),datetime(DnDt),source(Url)|Opts],
     Rens = [to_value_2(pos_integer),to_value_1(pfx_by('ENS')),datetime(DnDt),source(Url)|Opts],
     csv_ids_map( PigF, ncbi, ensg, Pig, GEnsGF, [header(row('Entrez ID','Ensembl Gene'))|Lens] ),
     csv_ids_map( PigF, ensg, ncbi, Pig, EnsGGF, [header(row('Ensembl Gene','Entrez ID'))|Rens] ),
     % need to ensure prots are of ENSP  there are - in some entries
     Lenp = [to_value_1(pos_integer),to_value_2(pfx_by_de_v('ENS')),datetime(DnDt),source(Url)|Opts],
     csv_ids_map( PigF, ncbi, ensp, Pig, GEnsPF, [header(row('Entrez ID','Ensembl Protein'))|Lenp] ),
     Renp = [to_value_2(pos_integer),to_value_1(pfx_by_de_v('ENS')),datetime(DnDt),source(Url)|Opts],
     csv_ids_map( PigF, ensp, ncbi, Pig, EnsPGF, [header(row('Ensembl Protein','Entrez ID'))|Renp] ),
     Nens = [to_value_1(pos_integer),datetime(DnDt),source(Url)|Opts],
     csv_ids_map( PigF, ncbi, 'Symbol', Pig, NcbiSymbF, [header(row(ncbi,symbol))|Nens] ),
     link_to_bio_sub( ncbi, [GEnsGF,EnsGGF,GEnsPF,EnsPGF,NcbiSymbF], [type(maps)|Opts] ).

pos_integer( Numb, Numb ) :-
     integer( Numb ),
     !,
     Numb > 0.
pos_integer( Atom, Numb ) :-
     atom_number( Atom, Numb ),
     !,
     integer( Numb ), 
     Numb > 0.

pfx_by_de_v( Pfx, Full, UnV ) :-
     prefix_atom( Pfx, Full ),
    ( atomic_list_concat([UnV,_],'.',Full) ->
        true
        ;
        UnV = Full
    ).

pfx_by( Pfx, Full, Full ) :-
     prefix_atom( Pfx, Full ).