#!/usr/bin/perl 
#-w
# Script to generate Dods aggregation xml entries from files
#
# This routine actually should work on section and regular grid files..
#

##Changelog
# v.4      introduced subroutines and templates (bjorges)
# v.5      use to make XML according to Dataset Inventory Catalog
#          Specification 1.0.1 - only works on THREDDS 3.4 or newer!
#          (bjorges)
# 20060127 changed inputscalar service to service_type
#          changed inputscalar servicebase to datasetRootLocation
#          added inputscalars service_name, service_base, datasetRootPath
# 20060207 s/JoinNew/joinNew/ <- typo
# 20060208 bugfix (servicebase)
# 20061025 fixed template handling, added inputscalar aggvars
# 20070330 commented out <dimension name="time" length="0" /> when upgrading
#          THREDDS from 3.6 to 3.14.
# 20070418 Added things from a new metadata module
# 20080521 KAL - parse_dir seems to fail on some machines - replaced
# 20080521 KAL - Added security check for template: only works when mersea_cat is a directory
# 20090105 KAL - Changed from joinNew to JoinExisting aggregation
# 20090213 GZ -  This joinNew ( used & works with class 2 & class 3 because the files do not have time as a dim. & coord. var.)
##

# Really should try harder to use these..
#use warnings;
#use strict;

## BEGIN MODULE IMPORT

use lib qw(/Data/topaz/Perl/lib/perl5/
           /Data/topaz/Perl/lib/perl5/site_perl
           /Data/topaz/Knut/thredds-config/MetadataXML);
#use lib qw(/export/home/nersc/Perl/lib/perl5
#	   /export/home/nersc/Perl/lib/perl5/site_perl
#	   /Users/knutal/Testing/XML-gen/MetadataXML);
#use Date::Calc qw(Delta_Days Add_Delta_Days);
use Data::Dumper;
use List::Util qw(min max);
use XML::Writer;
use IO::File;
use File::Glob ':glob';
use Getopt::Long;
#use DateTime;
use Time::Local;
require 'metadataGen.pl';

## END MODULE IMPORT

# Mandatory configuration scalars. Every name listed here must be assigned in
# the configfile (enforced by checkscalars). The names double as package
# variable names: assignscalars stores each value via a symbolic reference,
# which only works because this script runs without 'use strict'.
my @inputscalars = qw(mersea_cat
		      dataseturlpath
		      catalogname
		      topdatasetname
		      datasetname
		      datasetRootLocation
		      datasetRootPath
		      service_name
		      service_base
		      service_type
		      xmlfile
		      aggvars
		      geospatialCoverage); # New attribute - required
# Optional configuration scalars: used when present (mostly THREDDS metadata
# elements emitted by writeheader), silently left undef otherwise.
my @optinputscalars = qw(bulletindates
			 authority
			 dataType
			 dataFormat
			 doc_rights
			 doc_simple_href
			 doc_simple_title
			 doc_summary
			 creator_name
			 creator_contact_url
			 creator_contact_email
			 publisher_name
			 publisher_contact_url
			 publisher_contact_email
			 timeCoverage_end
			 timeCoverage_duration
			 timeCoverage_start); # these are the ones we use, there are more

# my ($headerflag,$footerflag,$appendflag,$template,$pipe);
# my ($mersea_cat,
#     $dataseturlpath,
#     $catalogname,
#     $topdatasetname,
#     $datasetname,
#     $datasetRootLocation,
#     $datasetRootPath,
#     $service_name,
#     $service_base,
#     $service_type,
#     $xmlfile);
# my ($bulletindates,
#     $authority,
#     $dataType,
#     $dataFormat,
#     $doc_rights,
#     $doc_simple_href,
#     $doc_simple_title,
#     $doc_summary,
#     $creator_name,
#     $creator_contact_url,
#     $creator_contact_email,
#     $publisher_name,
#     $publisher_contact_url,
#     $publisher_contact_email,
#     $timeCoverage_end,
#     $timeCoverage_duration,
#     $timeCoverage_start);
# my @ncvars;
# my $writer;
# my $fh;
# my $ncfile;
# my @lines;
# my $output;
# my $bd;
# my %bulletins;
# my ($forc_year,$forc_dom,$forc_month);
# my @bulletindates;
# my $element;
# my @befiles;
# my ($be,$prev,$abd,$i);
# my %forecast;
# my @files;
# my ($tmp,$var);
# my @listfiles;

## BEGIN SUBROUTINES

sub usage {

    # Print the command-line help text: synopsis, option summary, and the
    # lists of mandatory and optional configfile scalars. Output is written
    # to STDOUT; the caller decides whether to die afterwards.
    print "Usage:\n\t$0 [OPTIONS] configfile[.input.pl]\n",
	  "\t$0 [OPTIONS] - < configfile[.conf.pl]\n",
	  "\tcat configfile | $0 [OPTIONS] -\n",
	  "\n\tThe configfile must have all the scalars defined like this:\n",
	  "\t\$scalar=\"value\";\n";
    print "\n\tOPTIONS:\n",
	  "\t-h[eaderoutput]\t\t(default)Include the XML-headers\n\n",
	  "\t-noh[eaderoutput]\tDo not include the XML-headers\n\n",
	  "\t-f[ooteroutput]\t\t(default)Include the XML-footers\n\n",
	  "\t-nof[ooteroutput]\tDo not include the XML-footers\n\n",
	  "\t-a[ppend]\t\tAppend to file as opposed to overwrite\n\n",
	  "\t-noa[ppend]\t\t(default)Overwrite file as opposed to appending\n\n",
	  "\t-t[emplate]\t\tThe configfile will be interpreted as a templatefile\n",
	  "\t\t\t\twith possible ending .template.pl.\n\n",
	  "\t-\t\t\tRead from standard input\n";
    print "\n\tSCALARS - these scalars has to be defined in the configfile,\n";
    print map { "\t\t  \$$_ = \"VALUE(S)\";\n" } @inputscalars;
    print "\n\tSCALARS (optional) - these scalars may be defined in the configfile,\n";
    print map { "\t\t  \$$_ = \"VALUE(S)\";\n" } @optinputscalars;
}

sub init {

    # Parse command-line switches into package globals.
    # Header and footer output default to on; both options are negatable
    # ("!" spec), so -noh / -nof suppress them (used by the recursive
    # template invocation so only the outermost call writes the envelope).
    $headerflag = 1;
    $footerflag = 1;

    # The empty option name "" matches a lone "-" argument, meaning
    # "read the configfile from standard input" (sets $pipe).
    GetOptions("headeroutput!"=>\$headerflag,
	       "footeroutput!"=>\$footerflag,
	       "append!"=>\$appendflag,
	       "template"=>\$template,
	       ""=>\$pipe);
    #print "template=$template\n";
}

sub retrievefile {

    # Resolve a configfile name that may be given without its suffix, read
    # it, and return its lines stripped of comments and whitespace.
    # Suffix resolution: the literal name wins; otherwise ".input.pl" is
    # tried for normal runs and ".template.pl" when -t[emplate] was given.
    # Dies (after printing usage) when no candidate file exists.
    my $filename = shift;

    if ( -e $filename ) {
	# use the name exactly as given
    }elsif ( -e $filename.".input.pl" && !$template ) {

	$filename .= ".input.pl";
    }elsif ( -e $filename.".template.pl" && $template ) {

	$filename .= ".template.pl";
    }else{

	print "File not found: $filename\n\n";
	&usage();
	die "\nFile not found.\n";
    }

    # BUGFIX: the open was previously unchecked (and used indirect-object
    # syntax); a permission error surfaced later as a read from an undefined
    # handle. Fail loudly and immediately instead.
    my $cfg_fh = IO::File->new($filename, "r")
	or die "Cannot open $filename: $!\n";

    my @cfglines = <$cfg_fh>;
    $cfg_fh->close();

    return &remCommentsAndWS( @cfglines );
}

sub remCommentsAndWS {

    # Clean up raw configfile lines: erase "#" comments, drop lines that are
    # blank afterwards, strip trailing newlines, and finally trim surrounding
    # whitespace via removewhite(). Returns the cleaned list; arguments are
    # not modified (we work on a copy).
    my @raw = @_;
    my @kept;
    for my $line ( @raw ) {

	$line =~ s/#.*//;           # erase comments (NB!NB! also eats any $#array)
	next if $line =~ /^\s*$/;   # skip lines that are now blank
	chomp $line;                # remove trailing newline characters
	push @kept, $line;
    }
    return &removewhite(@kept);
}

sub assignscalars {

    # Scan configfile lines of the form  $name = "value"  and, for every
    # name appearing in the known scalar lists, store the value in the
    # package variable of the same name. The assignment goes through a
    # symbolic reference, which relies on the script not using 'strict refs'.
    my $known = join("|",@inputscalars)."|".join("|",@optinputscalars);
    for my $line ( @_ ) {

	next unless $line =~ /^\s*\$($known)\s*=\s*\"(.*)\"/;
	my ($name, $value) = ($1, $2);
	$$name = $value;
    }
}

sub checkscalars {

    # Verify that every mandatory scalar was assigned by assignscalars.
    # On the first missing one: print usage, dump the current value of every
    # mandatory and optional scalar (via symbolic deref), and abort.
    for my $required ( @inputscalars ) {

	next if defined($$required);

	&usage();
	print "\nYour file must define all neccesary variables/scalars:\n\n";
	print "\t$_ is set to $$_\n" for @inputscalars;
	print "\t(optional) $_ is set to $$_\n" for @optinputscalars;
	die "Exiting!";
    }
}

sub removewhite {

    # Trim leading and trailing whitespace from every argument.
    # Works on copies, so the caller's values are left untouched; returns
    # the trimmed list.
    my @trimmed = @_;
    for my $word (@trimmed) {

	$word =~ s/^\s+|\s+$//g;
    }
    return @trimmed;
}

sub writeheader {
    # Emit the catalog preamble: XML declaration, the opening <catalog> and
    # top-level <dataset> tags, the OpenDAP <service> elements, and an
    # inherited <metadata> block assembled from the optional config scalars.
    # writefooter() closes the tags opened here. Reads the package globals
    # set by assignscalars ($catalogname, $service_*, $datasetRoot*, the
    # doc_*/creator_*/publisher_* scalars) plus $writer and $mersea_cat.
    #KAL -- Added metadata retrieval subroutine, present in the metadata package
	require 'metadataGen.pl';

	#KAL -- Get "Expiry date for data set
	# NOTE(review): metadata::getexpiry lives in metadataGen.pl (not visible
	# here); presumably it returns a date string suitable for the catalog
	# 'expires' attribute -- confirm its format against the THREDDS spec.
	$expiry_date=metadata::getexpiry($mersea_cat);

    $writer->xmlDecl("UTF-8");
    $writer->startTag('catalog',
		      'name'         => "$catalogname",
		      'version'      => '1.0.1',
		      'xmlns'        => 'http://www.unidata.ucar.edu/namespaces/thredds/InvCatalog/v1.0',
		      'xmlns:xlink'  => 'http://www.w3.org/1999/xlink',
		      'xmlns:xsi'    => 'http://www.w3.org/2001/XMLSchema-instance',
		      'xsi:schemaLocation'=>'http://www.unidata.ucar.edu/namespaces/thredds/InvCatalog/v1.0  http://www.unidata.ucar.edu/schemas/thredds/InvCatalog.1.0.xsd',  
		      'expires'      => "$expiry_date");
    $writer->startTag('dataset', 
		      'name'         => "$topdatasetname");
    # A bare OpenDAP service named "this"; the inherited <serviceName> below
    # points at it, so child datasets resolve URLs relative to the catalog.
    $writer->emptyTag('service', 
		      'name'         => 'this',
		      'serviceType'  => 'OpenDAP',
		      'base'         => '');
    # NOTE(review): <datasetRoot> is nested inside this <service> element;
    # THREDDS usually expects datasetRoot as a direct catalog child -- this
    # apparently works on the deployed server, but verify when upgrading.
    $writer->startTag('service', 
		      'name'         => "$service_name",
		      'serviceType'  => "$service_type", #must be OPENDAP
		      'base'         => "$service_base");
    $writer->emptyTag('datasetRoot',
		      'path'         => "$datasetRootPath",
		      'location'     => "$datasetRootLocation");
    $writer->endTag('service');
    # Everything below is inherited metadata shared by all child datasets.
    $writer->startTag('metadata',
		      'inherited'    => 'true');

	#KAL -- Add aggregation level property tag for mersea - this depends on  (for now) the dataseturlpath
	&metadata::agglevelproperty('view','no',$writer,$dataseturlpath);

    $writer->startTag('serviceName');
    $writer->characters('this');
    $writer->endTag('serviceName');
    # Each optional scalar produces its THREDDS metadata element only when
    # the configfile defined it.
    if ( defined($authority) ) {
	$writer->startTag('authority');
	$writer->characters($authority);
	$writer->endTag('authority');
    }
    if ( defined($dataType) ) {
	$writer->startTag('dataType');
	$writer->characters($dataType);
	$writer->endTag('dataType');
    }
    if ( defined($dataFormat) ) {
	$writer->startTag('dataFormat');
	$writer->characters($dataFormat);
	$writer->endTag('dataFormat');
    }
    if ( defined($doc_rights) ) {
	$writer->startTag('documentation',
			  'type'      => 'rights');
	$writer->characters($doc_rights);
	$writer->endTag('documentation');
    }
    if ( defined($doc_simple_href) && defined($doc_simple_title) ) {
	$writer->emptyTag('documentation',
			  'xlink:href' => $doc_simple_href,
			  'xlink:title'=> $doc_simple_title);
    }
    if ( defined($doc_summary) ) {
	$writer->startTag('documentation',
			  'type' => 'summary');
	$writer->characters($doc_summary);
	$writer->endTag('documentation');
    }
    # <creator>: name plus contact; a single <contact> carries both url and
    # email when both exist, otherwise one element per defined attribute.
    if ( defined($creator_name) || defined($creator_contact_url) || defined($creator_contact_email) ) {
	$writer->startTag('creator');
	if ( defined($creator_name) ) {
	    $writer->startTag('name',
			      'vocabulary' => 'DIF');
	    $writer->characters($creator_name);
	    $writer->endTag('name');
	}
	if ( defined($creator_contact_url) && defined($creator_contact_email) ) {
	    $writer->emptyTag('contact',
			      'url' => $creator_contact_url,
			      'email' => $creator_contact_email);
	}else{
	    $writer->emptyTag('contact',
			      'email' => $creator_contact_email) if defined($creator_contact_email);
	    $writer->emptyTag('contact',
			      'url' => $creator_contact_url) if defined($creator_contact_url);
	}
	$writer->endTag('creator');
    }
    # <publisher>: same structure as <creator> above.
    if ( defined($publisher_name) || defined($publisher_contact_url) || defined($publisher_contact_email) ) {
	$writer->startTag('publisher');
	if ( defined($publisher_name) ) {
	    $writer->startTag('name',
			      'vocabulary' => 'DIF');
	    $writer->characters($publisher_name);
	    $writer->endTag('name');
	}
	if ( defined($publisher_contact_url) && defined($publisher_contact_email) ) {
	    $writer->emptyTag('contact',
			      'url' => $publisher_contact_url,
			      'email' => $publisher_contact_email);
	}else{
	    $writer->emptyTag('contact',
			      'email' => $publisher_contact_email) if defined($publisher_contact_email);
	    $writer->emptyTag('contact',
			      'url' => $publisher_contact_url) if defined($publisher_contact_url);
	}
	$writer->endTag('publisher');
    }


	#KAL - this is done by a different subroutine now 
	&metadata::main_metadatagen($mersea_cat,'no',$writer,$geospatialCoverage);

    $writer->endTag('metadata');
}

sub writefooter {

    # Close the <dataset> and <catalog> elements opened by writeheader and
    # flush/finalise the XML writer.
    $writer->endTag($_) for qw(dataset catalog);
    $writer->end();
}

sub initiatedataset {

    # Open one aggregated <dataset> element: metadata (aggregation-level
    # property + time coverage derived from @dates), serviceName, and the
    # NcML <netcdf>/<aggregation> skeleton. The caller then emits one
    # <netcdf location=... coordValue=...> per file and finishes with
    # enddataset(). Uses globals $writer, $datasetRootPath, $dataseturlpath,
    # $aggvars.
    # Parameters: dataset display name, dataset ID/urlPath suffix, and the
    # (sorted) file list used for the time-coverage metadata.
    # NOTE(review): the unpacked $name/$ID are never used below -- the code
    # reads $_[0]/$_[1] directly (equivalent, since @_ is not modified).
    my ( $name , $ID , @dates ) =@_;

    # 2 parameters; string datasetname, string dataseturlpath
    die "Internal error: initiatedataset missing input" if !defined($_[1]);
    $writer->startTag('dataset', 
		      'name'         => "$_[0]",
		      'ID'           => "$_[1]",
		      'urlPath'      => "$datasetRootPath/$_[1]");
    $writer->startTag('metadata',
	                  'inherited' => 'true');
	&metadata::agglevelproperty('view','no',$writer,$dataseturlpath);
    &metadata::metadata_timeCoverage($writer,'no',@dates);
    $writer->endTag('metadata');
    $writer->startTag('serviceName');
    $writer->characters('TOPAZ');
    $writer->endTag('serviceName');
    $writer->startTag('netcdf',
		      'xmlns'        => 'http://www.unidata.ucar.edu/namespaces/netcdf/ncml-2.2');
#     $writer->emptyTag('dimension',
# 		      'name'         => 'time',
# 		      'length'       => '0');
    # Declare the new time coordinate variable that the joinNew aggregation
    # will populate from the per-file coordValue attributes.
    # NOTE(review): the unit string "secs since ..." is non-standard udunits
    # (usually "seconds since ..."); confirm the TDS version accepts it.
    $writer->startTag('variable',
		      'name'         => 'time',
		      'type'         => 'int',
		      'shape'        => 'time');
    $writer->emptyTag('attribute',
		      'name'         => 'units',
		      'value'        => 'secs since 1970-01-01 00:00:00');
    $writer->emptyTag('attribute',
		      'name'         => '_CoordinateAxisType',
		      'value'        => 'time');
    $writer->endTag('variable');
    # joinNew is used (not joinExisting) because these files do not carry
    # time as a dimension/coordinate variable -- see 20090213 changelog note.
    $writer->startTag('aggregation', 
		      'dimName'      => 'time',
		      'type'         => 'joinNew');
#    $writer->startTag('aggregation', 
#		      'dimName'      => 'time',
#		      'type'         => 'joinExisting');
    #'recheckEvery' => '1 days'
    #'dateFormat'   => 'yyyy/M/d:hh:mm:ss z' );

    # insert variableAgg
    #$ncfile = `ls -1 $mersea_cat | head -1`;
    #$ncfile = glob( $mersea_cat );
    #chomp $ncfile;
    #@ncvars = `ncdump -h $ncfile | grep -v : | grep short | cut -c8- | sed 's/(.*//'`;
    # The variables to aggregate come from the $aggvars config scalar
    # (whitespace-separated), not from inspecting the files (commented out).
    @ncvars = (split " ", $aggvars);
    foreach (@ncvars) {
	chomp;
	$writer->emptyTag('variableAgg',
			  'name'         => "$_");
    }
}

sub enddataset {

    # Close the <aggregation>, <netcdf> and <dataset> elements opened by
    # initiatedataset, in nesting order.
    $writer->endTag($_) for qw(aggregation netcdf dataset);
}

sub finddate {
    # Extract the forecast date from a filename shaped like
    # ..._bYYYYMMDD_fYYYYMMDD9999.ext and return it as a
    # (year, month, day) string triple. Dies when the filename does not
    # yield exactly eight digits.
    my $fname = shift;
    die "Internal error: finddate missing input" if !$fname;

    (my $digits = $fname) =~ s/^.*_f//;   # keep what follows the last "_f"
    $digits =~ s/9999\..*$//;             # drop the "9999.<ext>" tail

    die "Filename not containing a date\n"
	if length($digits) != 8 || $digits !~ /^\d+$/; # 8 digits

    return ( substr($digits,0,4),         # year
	     substr($digits,4,2),         # month
	     substr($digits,6,2) );       # day of month
}

sub findbdate { # Returns bulletin date
    # Extract the bulletin date from a filename shaped like
    # ..._bYYYYMMDD_fYYYYMMDD9999.ext and return it as a
    # (year, month, day) string triple. Dies when the filename does not
    # yield exactly eight digits.
    my $fname = shift;
    die "Internal error: findbdate missing input" if !$fname;

    (my $digits = $fname) =~ s/^.*_b//;   # keep what follows the last "_b"
    $digits =~ s/_f.*$//;                 # drop the forecast-date tail
    #print "bull date: $digits \n";

    die "Filename not containing a date\n"
	if length($digits) != 8 || $digits !~ /^\d+$/; # 8 digits

    return ( substr($digits,0,4),         # year
	     substr($digits,4,2),         # month
	     substr($digits,6,2) );       # day of month
}




sub template {
    # Template mode: treat $mersea_cat as a directory and run this very
    # script once per subdirectory, piping each child a generated configfile
    # on STDIN ("-nof -noh -a -": no header/footer, append to $xmlfile).
    # This routine writes the header itself, lets the children append their
    # dataset sections, then appends the footer and exits the process.
    use File::Listing;
    use File::Basename;

    #KAL - if template, mersea_cat must be directory
	-d $mersea_cat or die "For templates mersea_cat must be a directory";

    $output = new IO::File "$xmlfile", "w"; # start new clean file
    $writer = XML::Writer->new(DATA_INDENT => 1 , DATA_MODE => 1 , OUTPUT => $output);
    &writeheader();
    $output->close;
#    for (parse_dir(`ls -l $mersea_cat`)) {
#	(my $dname, my $type) = @$_;
#	next if $type ne 'd'; # directory
#       print "$dname \n";
    for (glob "$mersea_cat" . "/*") {
       $_=basename $_;
	$dname = $_;
	next if not -d "$mersea_cat/$_" ; # skip if not directory

	# NOTE(review): $0 is interpolated into a shell pipe; a script path
	# containing shell metacharacters would break (or worse). Consider
	# list-form open if this ever runs outside controlled paths.
	my $recfh = new IO::File "| $0 -nof -noh -a -" or die "Error in running recursively";
	# Re-emit every mandatory scalar for the child, specialising the
	# directory-dependent ones for this subdirectory ($dname).
	foreach ( @inputscalars ) {
	    
	    if ( $_ eq 'mersea_cat' ) {
		$var = "$$_/$dname/*.nc";
	    }elsif ($_ eq 'dataseturlpath' ) {
		$var = "$$_-$dname";
	    }elsif ($_ eq 'datasetname' ) {
		$var = "$$_ $dname";
	    }elsif ($_ eq 'datasetRootLocation' ) {
		$var = "$datasetRootLocation$dname/"; #assumes we use append in recurs!
	    }else {
		$var = $$_;
	    }
	    $recfh->print("\$$_ = \"$var\";");
	}
	# Optional scalars are forwarded unchanged (undef becomes "").
	foreach ( @optinputscalars ) {
	    
	    $recfh->print("\$$_ = \"$$_\";");
	}
	$recfh->close;
    }
    $output = new IO::File "$xmlfile", "a"; # append footer
    $writer = XML::Writer->new(DATA_INDENT => 1 , DATA_MODE => 1 , OUTPUT => $output, UNSAFE => 1);
    &writefooter() if $footerflag;
    $output->close;

    print "Finished producing xml file: $xmlfile.\n" if $footerflag;
    exit;
}

sub checkprefix {

    (my $sb, my $file) = @_;
    my @prefix = split("::",$sb);
    if (  $appendflag && shift(@prefix) eq 'prefix' ) {

	return pop(@prefix)."/$file";
    }else{

	return $file;
    }
}

## End SUBROUTINES

## MAIN

# Parse command-line switches before reading any configuration.
&init();

## BEGIN READING INPUT

# The configfile arrives either on STDIN (a lone "-" argument sets $pipe),
# or as a filename argument; with neither, print usage and quit.
if ( $pipe ) {

    @lines = &remCommentsAndWS(<STDIN>);

}elsif ( @ARGV ) {
    
    @lines = &retrievefile(shift);
    # A second filename argument is not supported; warn and ignore it.
    print "So far $0 only takes one file at the time.\n" if shift;
}else { 
    
    &usage();
    die "Exiting!\n";
}

## END READING INPUT

# Re-split on ";" so each statement, even one spanning several input lines,
# becomes a single entry (this also removes newlines).
@lines = split(";",join("",@lines)); # removes newlines

# Now @lines has only entries on the form "$scalar=EXPR"

## BEGIN INITIALIZATIONS

&assignscalars( @lines ); # does the assignments from the input[file]
&checkscalars(); # checks if all the neccesary scalars are present
&template() if $template; # template mode recurses per subdirectory, then exits

# Interpret the optional $bulletindates config scalar: a comma-separated mix
# of "be" (best estimates), "abd" (all bulletin dates) and explicit dates.
# When unset, default to both best estimates and all bulletin dates.
if ( defined($bulletindates) ) {
    
    foreach ( split(",",$bulletindates) ) {

	if ( $_ =~ /^\s*be\s*$/ ) { 
	    
	    # has the user requested best estimates?
	    $be = 1; 
	}elsif ( $_ =~ /^\s*abd\s*$/ ) { 
	    
	    # has the user requested all bulletin dates?
	    $abd = 1; 
	}else { 

	    # add all requested bulletin dates
	    # Note: duplicates will be removed later
	    # BUGFIX: removewhite() returns a list; the old scalar assignment
	    # ($_ = &removewhite($_)) evaluated that array in scalar context
	    # and stored the element count (always 1) instead of the trimmed
	    # date. List assignment keeps the trimmed string itself.
	    ($_) = &removewhite($_);
	    push(@bulletindates,$_); 
	}
    }
}else{

    # No explicit request: best estimate plus every bulletin date.
    $be = 1;
    $abd = 1;
}

## END OF INITIALIZATIONS 

# find files matching the $mersea_cat glob pattern (e.g. "dir/*.nc")
@listfiles=bsd_glob("$mersea_cat");
if ( $#listfiles < 0 )  {

    print "No files found in $mersea_cat \n";
    die "I quit \n";
}

# Keep only the basenames; the directory part is re-added later through
# $datasetRootLocation when the <netcdf location=...> tags are written.
foreach (@listfiles) {

    $tmp=$_;
    $tmp=~s/.*\///; # remove path
    push @files,$tmp ;
}

# 1) Get bulletin and forecast dates 
for ($i=0; $i<=$#files; $i++) {
#    #Bulletin
#    my $tmp=$files[$i];
#    $tmp=~s/^.*_b//; # remove everything before bulletindate
#    $tmp=~s/_f.*$//; # remove everything after bulletindate
#    
#    #Forecast
#    my $tmp2=$files[$i]; 
#    $tmp2=~s/^.*_f//; # remove everything before forecastdate
#    $tmp2=~s/9999\..*$//; # remove everything after forecastdate
    # $tmp  = bulletin date "YYYYMMDD", $tmp2 = forecast date "YYYYMMDD"
    my $tmp  = join "" , findbdate($files[$i]) ;
    my $tmp2 = join "" , finddate ($files[$i]) ;

    
    # Create a "forecast" hash element (hash of hashes)
    # Each forecast date has an associated bulletin date (tmp) which points to
    # the filename
    if ( $be ) {
	$forecast{$tmp2}->{$tmp}=$files[$i];
    }

    # Create a "bulletin" hash element (hash of hashes)
    # Each bulletin date has an associated forecast date (tmp2) which points to
    # the filename
    # NOTE(review): this only fills %bulletins when the user listed explicit
    # dates; with "abd" alone, @bulletindates is still empty on the first
    # iteration, but the push below makes the condition true from the second
    # file onward -- so the first file's bulletin entry is skipped when abd
    # is combined with no explicit dates. Confirm whether this is intended.
    if ( $#bulletindates >= 0) {
	$bulletins{$tmp}->{$tmp2}=$files[$i];
    }

    # If we are supposed to list all bulletindates, add all to the list
    if ( $abd ) {
	push(@bulletindates,$tmp); 
    }
}


## START SORTING

# For each forecast date keep only the file issued by the newest bulletin --
# that newest bulletin is the "best estimate" for the date.
for my $fdate (keys %forecast) {
    my $newest_bulletin = max( keys %{ $forecast{$fdate} } );
    push @befiles, $forecast{$fdate}->{$newest_bulletin};
}
# Sort best estimate files
@befiles = sort @befiles;

# Sort the bulletin dates, then drop duplicates (a seen-hash removes them
# regardless of adjacency; the list is already sorted anyway).
@bulletindates = sort @bulletindates;
my %seen_bdate;
@bulletindates = grep { !$seen_bdate{$_}++ } @bulletindates;

## END SORTING

## START WRITING

# Write to XML file. In append mode (recursive template invocation) the
# writer runs UNSAFE so it does not emit a second XML declaration into an
# existing file; otherwise the file is created fresh.
# BUGFIX: both opens were previously unchecked (and used indirect-object
# syntax); a failed open left $output undefined and every subsequent write
# failed silently. Die with the OS error instead.
if ( $appendflag ) {

    $output = IO::File->new($xmlfile, "a")
	or die "Cannot open $xmlfile for appending: $!\n";
    $output->print("\n");
    $writer = XML::Writer->new(DATA_INDENT => 1 , DATA_MODE => 1 , OUTPUT => $output, UNSAFE => 1);
}else{
    
    $output = IO::File->new($xmlfile, "w")
	or die "Cannot open $xmlfile for writing: $!\n";
    $writer = XML::Writer->new(DATA_INDENT => 1 , DATA_MODE => 1 , OUTPUT => $output);
}

&writeheader() if $headerflag;

# Emit the "best estimate" aggregation: one <netcdf> entry per file, with
# the forecast date (as a Unix epoch) as the joinNew coordinate value.
if ( $be ) {
    
    &initiatedataset("Best estimate - $datasetname","$dataseturlpath-be",@befiles);

    # Go through each file 
    foreach $element ( @befiles ) {
	
	# find forecastdate
	($forc_year,$forc_month,$forc_dom) = finddate( $element );

	# BUGFIX: timelocal() interpreted the forecast date in the server's
	# local time zone, while every earlier implementation (`date -u`,
	# DateTime with time_zone => 'UTC') and the declared time unit
	# "secs since 1970-01-01 00:00:00" intend UTC. timegm() yields UTC
	# midnight regardless of the host time zone.
	my $secs = timegm(0,0,0,$forc_dom,$forc_month-1,$forc_year); # Month in range 0..11

	$writer->emptyTag('netcdf', 
			  'location' => "$datasetRootLocation$element",
			  'coordValue'    => "$secs");
    }
    &enddataset();
}

# Emit one aggregation per requested bulletin date, each containing that
# bulletin's forecast files ordered by forecast date.
if ( @bulletindates ) {

    # go through each requested bulletin date
    foreach $bd ( @bulletindates ) {

	# The sorted forecast-date keys double as the time-coverage input
	# for initiatedataset.
	&initiatedataset("Bulletin $bd - $datasetname", "$dataseturlpath-$bd", sort keys % { $bulletins{$bd}  });


	# Go through each file 
	foreach ( sort keys %{ $bulletins{$bd} } ) {


	    # find forecastdate
	    ($forc_year,$forc_month,$forc_dom) = finddate( $bulletins{$bd}->{$_} );

	    # BUGFIX: timelocal() interpreted the date in the server's local
	    # time zone; the earlier implementations (`date -u`, DateTime
	    # with UTC) and the declared epoch unit intend UTC, so use
	    # timegm() instead.
	    my $secs = timegm(0,0,0,$forc_dom,$forc_month-1,$forc_year); # Month in range 0..11

	    $writer->emptyTag('netcdf', 
			      'location'  => "$datasetRootLocation$bulletins{$bd}->{$_}",
			      'coordValue'     =>  "$secs" );
	    
	}
	&enddataset();
    }
}

&writefooter() if $footerflag; # close <dataset>/<catalog> unless -nof was given
$output->close();
print "Finished producing xml file: $xmlfile.\n" if $footerflag;
