#!/usr/bin/perl 
#-w
# Script to generate Dods aggregation xml entries from files
#
# This routine actually should work on section and regular grid files..
#

##Changelog
# v.4      introduced subroutines and templates (bjorges)
# v.5      use to make XML according to Dataset Inventory Catalog
#          Specification 1.0.1 - only works on THREDDS 3.4 or newer!
#          (bjorges)
# 20060127 changed inputscalar service to service_type
#          changed inputscalar servicebase to datasetRootLocation
#          added inputscalars service_name, service_base, datasetRootPath
# 20060207 s/JoinNew/joinNew/ <- typo
# 20060208 bugfix (servicebase)
# 20061025 fixed template handling, added inputscalar aggvars
# 20070330 commented out <dimension name="time" length="0" /> when upgrading
#          THREDDS from 3.6 to 3.14.
##

# Really should try harder to use these..
#use warnings;
#use strict;

## BEGIN MODULE IMPORT

use lib qw(/export/home/nersc/Perl/lib/perl5
	   /export/home/nersc/Perl/lib/perl5/site_perl);
use Date::Calc qw(Delta_Days);
use Data::Dumper;
use List::Util qw(min max);
use XML::Writer;
use IO::File;
use File::Glob ':glob';
use Getopt::Long;

## END MODULE IMPORT

my @inputscalars = qw(mersea_cat
		      dataseturlpath
		      catalogname
		      topdatasetname
		      datasetname
		      datasetRootLocation
		      datasetRootPath
		      service_name
		      service_base
		      service_type
		      xmlfile
		      aggvars);
my @optinputscalars = qw(bulletindates
			 authority
			 dataType
			 dataFormat
			 doc_rights
			 doc_simple_href
			 doc_simple_title
			 doc_summary
			 creator_name
			 creator_contact_url
			 creator_contact_email
			 publisher_name
			 publisher_contact_url
			 publisher_contact_email
			 timeCoverage_end
			 timeCoverage_duration
			 timeCoverage_start); # these are the ones we use, there are more

# my ($headerflag,$footerflag,$appendflag,$template,$pipe);
# my ($mersea_cat,
#     $dataseturlpath,
#     $catalogname,
#     $topdatasetname,
#     $datasetname,
#     $datasetRootLocation,
#     $datasetRootPath,
#     $service_name,
#     $service_base,
#     $service_type,
#     $xmlfile);
# my ($bulletindates,
#     $authority,
#     $dataType,
#     $dataFormat,
#     $doc_rights,
#     $doc_simple_href,
#     $doc_simple_title,
#     $doc_summary,
#     $creator_name,
#     $creator_contact_url,
#     $creator_contact_email,
#     $publisher_name,
#     $publisher_contact_url,
#     $publisher_contact_email,
#     $timeCoverage_end,
#     $timeCoverage_duration,
#     $timeCoverage_start);
# my @ncvars;
# my $writer;
# my $fh;
# my $ncfile;
# my @lines;
# my $output;
# my $bd;
# my %bulletins;
# my ($forc_year,$forc_dom,$forc_month);
# my @bulletindates;
# my $element;
# my @befiles;
# my ($be,$prev,$abd,$i);
# my %forecast;
# my @files;
# my ($tmp,$var);
# my @listfiles;

## BEGIN SUBROUTINES

sub usage {

    # Print the command-line synopsis, the option summary, and the lists
    # of required and optional configfile scalars to stdout.
    print "Usage:\n\t$0 [OPTIONS] configfile[.input.pl]\n"
	. "\t$0 [OPTIONS] - < configfile[.conf.pl]\n"
	. "\tcat configfile | $0 [OPTIONS] -\n"
	. "\n\tThe configfile must have all the scalars defined like this:\n"
	. "\t\$scalar=\"value\";\n";
    print "\n\tOPTIONS:\n"
	. "\t-h[eaderoutput]\t\t(default)Include the XML-headers\n\n"
	. "\t-noh[eaderoutput]\tDo not include the XML-headers\n\n"
	. "\t-f[ooteroutput]\t\t(default)Include the XML-footers\n\n"
	. "\t-nof[ooteroutput]\tDo not include the XML-footers\n\n"
	. "\t-a[ppend]\t\tAppend to file as opposed to overwrite\n\n"
	. "\t-noa[ppend]\t\t(default)Overwrite file as opposed to appending\n\n"
	. "\t-t[emplate]\t\tThe configfile will be interpreted as a templatefile\n"
	. "\t\t\t\twith possible ending .template.pl.\n\n"
	. "\t-\t\t\tRead from standard input\n";
    print "\n\tSCALARS - these scalars has to be defined in the configfile,\n";
    print "\t\t  \$$_ = \"VALUE(S)\";\n" for @inputscalars;
    print "\n\tSCALARS (optional) - these scalars may be defined in the configfile,\n";
    print "\t\t  \$$_ = \"VALUE(S)\";\n" for @optinputscalars;
}

sub init {

    # Parse command-line flags into the corresponding package globals.
    # Header and footer output are on unless explicitly suppressed.
    ($headerflag, $footerflag) = (1, 1);

    GetOptions(
	"headeroutput!" => \$headerflag,
	"footeroutput!" => \$footerflag,
	"append!"       => \$appendflag,
	"template"      => \$template,
	""              => \$pipe,    # a lone "-" means: read stdin
    );
}

sub retrievefile {

    # Resolve a config-file argument to an actual filename, trying the
    # ".input.pl" / ".template.pl" suffixes depending on -template mode,
    # then return its lines with comments and whitespace stripped.
    my $filename = shift;

    if ( -e $filename ) {

	# use the name exactly as given
    }elsif ( -e $filename.".input.pl" && !$template ) {

	$filename .= ".input.pl";
    }elsif ( -e $filename.".template.pl" && $template ) {

	$filename .= ".template.pl";
    }else{

	print "File not found: $filename\n\n";
	&usage();
	die "\nFile not found.\n";
    }
    # BUGFIX: the open result was previously unchecked; a permission
    # problem (or the file vanishing between -e and open) produced a
    # confusing crash on the readline below instead of a clear message.
    $fh = new IO::File $filename, "r"
	or die "Cannot open $filename: $!\n";

    @lines = <$fh>;
    @lines = &remCommentsAndWS( @lines );

    $fh->close();
    return @lines;
}

sub remCommentsAndWS {

    # Strip "#" comments, blank lines and trailing newlines from the
    # input lines, then trim surrounding whitespace on what remains.
    # NB!NB! the comment stripping also eats any $#array in the input.
    my @kept;
    foreach my $raw ( @_ ) {

	(my $line = $raw) =~ s/#.*//;  # erase comments (copy; caller untouched)
	next if $line =~ /^\s*$/;      # skip lines that are now blank
	chomp $line;
	push @kept, $line;
    }
    return &removewhite(@kept);
}

sub assignscalars {

    # Apply the $scalar = "value" assignments read from the config input.
    # Only double-quoted assignments to whitelisted names (required or
    # optional input scalars) are honored; anything else is silently
    # ignored.  The assignment goes to the like-named package global via
    # a symbolic reference.
    my $vars = join("|",@inputscalars)."|".join("|",@optinputscalars);
    foreach ( @_ ) {
	
	if ( $_ =~ /^\s*\$($vars)\s*=\s*\"(.*)\"/ ) {
	    
	    $$1 = "$2";    # symbolic ref: set the global named by $1
	}
    }
}

sub checkscalars {

    # Verify that every required input scalar was assigned; on the first
    # missing one, print the usage text plus a dump of all scalar values
    # and abort.
    for my $required ( @inputscalars ) {
	next if defined($$required);

	&usage();
	print "\nYour file must define all neccesary variables/scalars:\n\n";
	print "\t$_ is set to $$_\n" for @inputscalars;
	print "\t(optional) $_ is set to $$_\n" for @optinputscalars;
	die "Exiting!";
    }
}

sub removewhite {

    # Return copies of the arguments with leading and trailing
    # whitespace removed; the caller's values are not modified.
    my @cleaned = @_;
    for my $item (@cleaned) {

	$item =~ s/^\s+//;
	$item =~ s/\s+$//;
    }
    return @cleaned;
}

sub writeheader {

    # Emit the XML declaration, the <catalog> / top <dataset> opening
    # tags, the OpenDAP service definitions, and an inherited <metadata>
    # block built from whichever optional scalars the config defined.
    # Tag order follows the THREDDS InvCatalog 1.0.1 schema; do not
    # reorder these calls.
    $writer->xmlDecl("UTF-8");
    $writer->startTag('catalog',
		      'name'         => "$catalogname",
		      'version'      => '1.0.1',
		      'xmlns'        => 'http://www.unidata.ucar.edu/namespaces/thredds/InvCatalog/v1.0',
		      'xmlns:xlink'  => 'http://www.w3.org/1999/xlink',
		      'xmlns:xsi'    => 'http://www.w3.org/2001/XMLSchema-instance',
		      'xsi:schemaLocation'=>'http://www.unidata.ucar.edu/namespaces/thredds/InvCatalog/v1.0  http://www.unidata.ucar.edu/schemas/thredds/InvCatalog.1.0.xsd') ;
    $writer->startTag('dataset', 
		      'name'         => "$topdatasetname");
    # "this" is the default service inherited by child datasets below.
    $writer->emptyTag('service', 
		      'name'         => 'this',
		      'serviceType'  => 'OpenDAP',
		      'base'         => '');
    $writer->startTag('service', 
		      'name'         => "$service_name",
		      'serviceType'  => "$service_type", #must be OPENDAP
		      'base'         => "$service_base");
    $writer->emptyTag('datasetRoot',
		      'path'         => "$datasetRootPath",
		      'location'     => "$datasetRootLocation");
    $writer->endTag('service');
    # Everything inside this metadata block is inherited by every
    # dataset in the catalog.
    $writer->startTag('metadata',
		      'inherited'    => 'true');
    $writer->startTag('serviceName');
    $writer->characters('this');
    $writer->endTag('serviceName');
    if ( defined($authority) ) {
	$writer->startTag('authority');
	$writer->characters($authority);
	$writer->endTag('authority');
    }
    if ( defined($dataType) ) {
	$writer->startTag('dataType');
	$writer->characters($dataType);
	$writer->endTag('dataType');
    }
    if ( defined($dataFormat) ) {
	$writer->startTag('dataFormat');
	$writer->characters($dataFormat);
	$writer->endTag('dataFormat');
    }
    if ( defined($doc_rights) ) {
	$writer->startTag('documentation',
			  'type'      => 'rights');
	$writer->characters($doc_rights);
	$writer->endTag('documentation');
    }
    # A plain xlink documentation entry needs both href and title; one
    # without the other is dropped.
    if ( defined($doc_simple_href) && defined($doc_simple_title) ) {
	$writer->emptyTag('documentation',
			  'xlink:href' => $doc_simple_href,
			  'xlink:title'=> $doc_simple_title);
    }
    if ( defined($doc_summary) ) {
	$writer->startTag('documentation',
			  'type' => 'summary');
	$writer->characters($doc_summary);
	$writer->endTag('documentation');
    }
    if ( defined($creator_name) || defined($creator_contact_url) || defined($creator_contact_email) ) {
	$writer->startTag('creator');
	if ( defined($creator_name) ) {
	    $writer->startTag('name',
			      'vocabulary' => 'DIF');
	    $writer->characters($creator_name);
	    $writer->endTag('name');
	}
	# Emit one combined contact tag when both parts are present,
	# otherwise a partial tag for whichever part exists.
	if ( defined($creator_contact_url) && defined($creator_contact_email) ) {
	    $writer->emptyTag('contact',
			      'url' => $creator_contact_url,
			      'email' => $creator_contact_email);
	}else{
	    $writer->emptyTag('contact',
			      'email' => $creator_contact_email) if defined($creator_contact_email);
	    $writer->emptyTag('contact',
			      'url' => $creator_contact_url) if defined($creator_contact_url);
	}
	$writer->endTag('creator');
    }
    if ( defined($publisher_name) || defined($publisher_contact_url) || defined($publisher_contact_email) ) {
	$writer->startTag('publisher');
	if ( defined($publisher_name) ) {
	    $writer->startTag('name',
			      'vocabulary' => 'DIF');
	    $writer->characters($publisher_name);
	    $writer->endTag('name');
	}
	if ( defined($publisher_contact_url) && defined($publisher_contact_email) ) {
	    $writer->emptyTag('contact',
			      'url' => $publisher_contact_url,
			      'email' => $publisher_contact_email);
	}else{
	    $writer->emptyTag('contact',
			      'email' => $publisher_contact_email) if defined($publisher_contact_email);
	    $writer->emptyTag('contact',
			      'url' => $publisher_contact_url) if defined($publisher_contact_url);
	}
	$writer->endTag('publisher');
    }
    if ( defined($timeCoverage_end) || defined($timeCoverage_duration) || defined($timeCoverage_start) ) {
	# defined() yields 1 or "" so the sum counts how many of the
	# three scalars are set.  NOTE(review): a timeCoverage with only
	# ONE of start/end/duration defined is silently dropped here --
	# confirm that is intended.
	if ( defined($timeCoverage_end) + defined($timeCoverage_duration) + defined($timeCoverage_start) > 1 ) {
	    $writer->startTag('timeCoverage');
	    if ( defined($timeCoverage_end) ) {
		$writer->startTag('end');
		$writer->characters($timeCoverage_end);
		$writer->endTag('end');
	    }
	    if ( defined($timeCoverage_duration) ) {
		$writer->startTag('duration');
		$writer->characters($timeCoverage_duration);
		$writer->endTag('duration');
	    }
	    if ( defined($timeCoverage_start) ) {
		$writer->startTag('start');
		$writer->characters($timeCoverage_start);
		$writer->endTag('start');
	    }
	    $writer->endTag('timeCoverage');
	}
    }
    $writer->endTag('metadata');
}

sub writefooter {

    # Close the top-level dataset and catalog elements opened by
    # writeheader() and finalize the writer (end() checks that all
    # opened tags were closed).
    $writer->endTag('dataset');
    $writer->endTag('catalog');
    $writer->end();
}

sub initiatedataset {

    # Open a <dataset> element plus its NcML <netcdf>/<aggregation>
    # scaffolding for one aggregated dataset.  Callers must balance this
    # with enddataset() after emitting the member <netcdf> entries.
    # 2 parameters; string datasetname, string dataseturlpath
    die "Internal error: initiatedataset missing input" if !defined($_[1]);
    $writer->startTag('dataset', 
		      'name'         => "$_[0]",
		      'ID'           => "$_[1]",
		      'urlPath'      => "$datasetRootPath/$_[1]");
    $writer->startTag('serviceName');
    # NOTE(review): 'TOPAZ' is hard-coded here while writeheader() takes
    # the service name from $service_name -- confirm this should not be
    # $service_name instead.
    $writer->characters('TOPAZ');
    $writer->endTag('serviceName');
    $writer->startTag('netcdf',
		      'xmlns'        => 'http://www.unidata.ucar.edu/namespaces/netcdf/ncml-2.2');
    # The explicit time dimension was dropped for THREDDS >= 3.14; see
    # the 20070330 changelog entry.
#     $writer->emptyTag('dimension',
# 		      'name'         => 'time',
# 		      'length'       => '0');
    $writer->startTag('variable',
		      'name'         => 'time',
		      'type'         => 'int',
		      'shape'        => 'time');
    # NOTE(review): "secs" is a nonstandard udunits spelling -- confirm
    # the consuming THREDDS/udunits version accepts it (vs "seconds").
    $writer->emptyTag('attribute',
		      'name'         => 'units',
		      'value'        => 'secs since 1970-01-01 00:00:00');
    $writer->emptyTag('attribute',
		      'name'         => '_CoordinateAxisType',
		      'value'        => 'time');
    $writer->endTag('variable');
    # joinNew: each member file contributes one slice along a new outer
    # "time" dimension, positioned by its coordValue.
    $writer->startTag('aggregation', 
		      'dimName'      => 'time',
		      'type'         => 'joinNew');
    #'recheckEvery' => '1 days'
    #'dateFormat'   => 'yyyy/M/d:hh:mm:ss z' );

    # insert variableAgg
    # The aggregated variable names now come from the $aggvars config
    # scalar (whitespace-separated) rather than an ncdump of the first
    # file (see 20061025 changelog entry and the commented code below).
    #$ncfile = `ls -1 $mersea_cat | head -1`;
    #$ncfile = glob( $mersea_cat );
    #chomp $ncfile;
    #@ncvars = `ncdump -h $ncfile | grep -v : | grep short | cut -c8- | sed 's/(.*//'`;
    @ncvars = (split " ", $aggvars);
    foreach (@ncvars) {
	chomp;
	$writer->emptyTag('variableAgg',
			  'name'         => "$_");
    }
}

sub enddataset {

    # Close the aggregation/netcdf/dataset elements opened by
    # initiatedataset().
    $writer->endTag('aggregation');
    $writer->endTag('netcdf');
    $writer->endTag('dataset');
}

sub finddate {

    # Extract the forecast date embedded in a filename of the form
    # ..._fYYYYMMDD9999.<ext> and return it as (year, month, day).
    # Dies when no 8-digit date can be isolated.
    my $fname = shift;
    die "Internal error: finddate missing input" if !$fname;

    (my $digits = $fname) =~ s/^.*_f//;  # keep what follows the last "_f"
    $digits =~ s/9999\..*$//;            # drop the trailing 9999.<extension>

    die "Filename not containing a date\n"
	unless length($digits) == 8 && $digits =~ /^\d+$/;

    return (substr($digits,0,4),   # year
	    substr($digits,4,2),   # month
	    substr($digits,6,2));  # day of month
}

sub template {

    # Template mode: write the catalog header, then re-invoke this script
    # once per subdirectory of $mersea_cat (feeding it a generated config
    # on stdin and appending to $xmlfile), and finally append the footer.
    # Never returns -- exits when done.
    use File::Listing;
    # BUGFIX: both opens of $xmlfile were previously unchecked; an
    # unwritable target crashed later inside XML::Writer instead of
    # failing here with a clear message.
    $output = new IO::File "$xmlfile", "w" # start new clean file
	or die "Cannot open $xmlfile for writing: $!\n";
    $writer = XML::Writer->new(DATA_INDENT => 1 , DATA_MODE => 1 , OUTPUT => $output);
    &writeheader();
    $output->close;
    # NOTE(review): $mersea_cat is interpolated into a shell command; a
    # path with shell metacharacters would break this (or worse).
    for (parse_dir(`ls -l $mersea_cat`)) {
	(my $dname, my $type) = @$_;
	next if $type ne 'd'; # only recurse into directories
	my $recfh = new IO::File "| $0 -nof -noh -a -" or die "Error in running recursively";
	foreach ( @inputscalars ) {

	    # Rewrite the per-directory scalars; pass the rest through.
	    my $var;    # BUGFIX: was an undeclared global shared across calls
	    if ( $_ eq 'mersea_cat' ) {
		$var = "$$_/$dname/*.nc";
	    }elsif ($_ eq 'dataseturlpath' ) {
		$var = "$$_-$dname";
	    }elsif ($_ eq 'datasetname' ) {
		$var = "$$_ $dname";
	    }elsif ($_ eq 'datasetRootLocation' ) {
		$var = "$datasetRootLocation$dname/"; #assumes we use append in recurs!
	    }else {
		$var = $$_;
	    }
	    $recfh->print("\$$_ = \"$var\";");
	}
	foreach ( @optinputscalars ) {
	    
	    $recfh->print("\$$_ = \"$$_\";");
	}
	# BUGFIX: the pipe close status was previously ignored, so a
	# failed child run silently produced a truncated catalog.
	$recfh->close
	    or die "Recursive invocation for $dname failed: $!\n";
    }
    $output = new IO::File "$xmlfile", "a" # append footer
	or die "Cannot open $xmlfile for appending: $!\n";
    $writer = XML::Writer->new(DATA_INDENT => 0 , DATA_MODE => 1 , OUTPUT => $output, UNSAFE => 1);
    &writefooter() if $footerflag;
    $output->close;

    print "Finished producing xml file: $xmlfile.\n" if $footerflag;
    exit;
}

sub checkprefix {

    # In append mode a service base of the form "prefix::...::BASE"
    # means the file should be addressed as "BASE/<file>"; in every
    # other case the filename is returned unchanged.
    my ($sb, $file) = @_;
    my @parts = split("::", $sb);

    return $file
	unless $appendflag && shift(@parts) eq 'prefix';
    return pop(@parts) . "/$file";
}

## End SUBROUTINES

## MAIN

&init();

## BEGIN READING INPUT

# The config can arrive three ways: piped on stdin ("-"), as a filename
# argument, or not at all (usage + abort).
if ( $pipe ) {

    @lines = &remCommentsAndWS(<STDIN>);

}elsif ( @ARGV ) {
    
    @lines = &retrievefile(shift);
    print "So far $0 only takes one file at the time.\n" if shift;
}else { 
    
    &usage();
    die "Exiting!\n";
}

## END READING INPUT

@lines = split(";",join("",@lines)); # removes newlines

# Now @lines has only entries on the form "$scalar=EXPR"

## BEGIN INITIALIZATIONS

&assignscalars( @lines ); # does the assignments from the input[file]
&checkscalars(); # checks if all the neccesary scalars are present
&template() if $template; # template mode never returns (it exits)

# Interpret the optional $bulletindates scalar: a comma-separated list
# of explicit dates and/or the keywords "be" (best estimate) and "abd"
# (all bulletin dates).  When absent, both modes are enabled.
if ( defined($bulletindates) ) {
    
    foreach ( split(",",$bulletindates) ) {

	if ( $_ =~ /^\s*be\s*$/ ) { 
	    
	    # has the user requested best estimates?
	    $be = 1; 
	}elsif ( $_ =~ /^\s*abd\s*$/ ) { 
	    
	    # has the user requested all bulletin dates?
	    $abd = 1; 
	}else { 

	    # add all requested bulletin dates
	    # Note: duplicates will be removed later
	    $_ = &removewhite($_);
	    push(@bulletindates,$_); 
	}
    }
}else{
    $be = 1;
    $abd = 1;
}

## END OF INITIALIZATIONS 

# find files
# $mersea_cat is a glob pattern (e.g. /path/*.nc) naming the input files.
@listfiles=bsd_glob("$mersea_cat");
if ( $#listfiles < 0 )  {

    print "No files found in $mersea_cat \n";
    die "I quit \n";
}

# Keep only the basename of each match; the directory is re-added later
# via $datasetRootLocation when the <netcdf> entries are written.
foreach (@listfiles) {

    $tmp=$_;
    $tmp=~s/.*\///; # remove path
    push @files,$tmp ;
}

# 1) Get bulletin and forecast dates 
# Filenames are assumed to look like ..._bYYYYMMDD_fYYYYMMDD9999.<ext>.
foreach my $file ( @files ) {
    
    #Bulletin
    my $tmp=$file;
    $tmp=~s/^.*_b//; # remove everything before bulletindate
    $tmp=~s/_f.*$//; # remove everything after bulletindate
    
    #Forecast
    my $tmp2=$file; 
    $tmp2=~s/^.*_f//; # remove everything before forecastdate
    $tmp2=~s/9999\..*$//; # remove everything after forecastdate
    
    # Create a "forecast" hash element (hash of hashes)
    # Each forecast date has an associated bulletin date (tmp) which points to
    # the filename
    if ( $be ) {
	
	$forecast{$tmp2}->{$tmp}=$file;
    }

    # Create a "bulletin" hash element (hash of hashes)
    # Each bulletin date has an associated forecast date (tmp2) which points to
    # the filename
    # BUGFIX: this previously tested only "$#bulletindates >= 0", so in
    # all-bulletin-dates mode ($abd set with no explicit dates requested)
    # the FIRST file was never entered into %bulletins -- its date was
    # only pushed below, after this test had already failed.
    if ( $abd || $#bulletindates >= 0 ) {
	
	$bulletins{$tmp}->{$tmp2}=$file;
    }

    # If we are supposed to list all bulletindates, add all to the list
    # (duplicates are removed after sorting)
    if ( $abd ) {

	push(@bulletindates,$tmp); 
    }
}


## START SORTING

# Loop through hashes and find the hash with largest bulletin date 
# (this is the "best estimate")
for my $key1 (keys %forecast) {
    # Push new file into file array
    push @befiles,$forecast{$key1}->{max(keys %{ $forecast{$key1}})};
}
# Sort best estimate files
@befiles = sort @befiles;

# sort bulletin daterefs
@bulletindates = sort @bulletindates;
# and remove duplicates
$prev = 'nonesuch';
@bulletindates = grep($_ ne $prev && (($prev) = $_), @bulletindates);

## END SORTING

## START WRITING

# Write to XML file
if ( $appendflag ) {

    $output = new IO::File "$xmlfile", "a";
    $output->print("\n");
    $writer = XML::Writer->new(DATA_INDENT => 1 , DATA_MODE => 1 , OUTPUT => $output, UNSAFE => 1);
}else{
    
    $output = new IO::File "$xmlfile", "w";
    $writer = XML::Writer->new(DATA_INDENT => 1 , DATA_MODE =>1 , OUTPUT => $output);
}

&writeheader() if $headerflag;

# Best-estimate dataset: one aggregation whose members are, per forecast
# date, the file from the newest bulletin.
if ( $be ) {
    
    &initiatedataset("Best estimate - $datasetname","$dataseturlpath-be");

    # Go through each file 
    foreach $element ( @befiles ) {
	
	# find forecastdate
	($forc_year,$forc_month,$forc_dom) = finddate( $element );

	#my $up = &checkprefix($datasetRootLocation,$element);
	# Convert the forecast date to epoch seconds for the joinNew
	# coordValue.  The interpolated values are digits validated by
	# finddate(), so the shell command is safe here.
	chomp (my $secs = `date -u -d $forc_month/$forc_dom/$forc_year +%s`);
	$writer->emptyTag('netcdf', 
			  'location' => "$datasetRootLocation$element",
			  'coordValue'    => "$secs");
    }
    &enddataset();
}

# Per-bulletin datasets: one aggregation per requested bulletin date,
# containing that bulletin's forecasts in forecast-date order.
if ( @bulletindates ) {

    # go trough each requested bulletin date
    foreach $bd ( @bulletindates ) {
	
	&initiatedataset("Bulletin $bd - $datasetname", "$dataseturlpath-$bd");

	# Go through each file 
	foreach ( sort keys %{ $bulletins{$bd} } ) {

	    # find forecastdate
	    ($forc_year,$forc_month,$forc_dom) = finddate( $bulletins{$bd}->{$_} );
	    
	    #my $up = &checkprefix($datasetRootLocation,$bulletins{$bd}->{$_});
	    chomp (my $secs = `date -u -d $forc_month/$forc_dom/$forc_year +%s`);
	    $writer->emptyTag('netcdf', 
			      'location'  => "$datasetRootLocation$bulletins{$bd}->{$_}",
			      'coordValue'     =>  "$secs" );
	    
	}
	&enddataset();
    }
}

&writefooter() if $footerflag;
$output->close();
print "Finished producing xml file: $xmlfile.\n" if $footerflag;
