#!/usr/bin/perl 
#-w
# Script to generate Dods aggregation xml entries from files
#
# This routine actually should work on section and regular grid files..
#

##Changelog
# v.4      introduced subroutines and templates (bjorges)
# v.5      use to make XML according to Dataset Inventory Catalog
#          Specification 1.0.1 - only works on THREDDS 3.4 or newer!
#          (bjorges)
# 20060127 changed inputscalar service to service_type
#          changed inputscalar servicebase to datasetRootLocation
#          added inputscalars service_name, service_base, datasetRootPath
# 20060207 s/JoinNew/joinNew/ <- typo
# 20060208 bugfix (servicebase)
# 20061025 fixed template handling, added inputscalar aggvars
# 20070330 commented out <dimension name="time" length="0" /> when upgrading
#          THREDDS from 3.6 to 3.14.
# 20070418 Added things from a new metadata module
# 20080521 KAL - parse_dir seems to fail on some machines - replaced
# 20080521 KAL - Added security check for template: only works when mersea_cat is a directory
# 20090105 KAL - Changed from joinNew to JoinExisting aggregation
# 20090213 GZ -  This joinNew ( used & works with class 2 & class 3 because the files do not have time as a dim. & coord. var.)
##

# Really should try harder to use these..
#use warnings;
#use strict;

## BEGIN MODULE IMPORT

use lib qw(/Data/topaz/Perl/lib/perl5/
           /Data/topaz/Perl/lib/perl5/site_perl
           /Data/topaz/Knut/thredds-config/MetadataXML);
#use lib qw(/export/home/nersc/Perl/lib/perl5
#	   /export/home/nersc/Perl/lib/perl5/site_perl
#	   /Users/knutal/Testing/XML-gen/MetadataXML);
#use Date::Calc qw(Delta_Days Add_Delta_Days);
use Data::Dumper;
use List::Util qw(min max);
use XML::Writer;
use IO::File;
use File::Glob ':glob';
use Getopt::Long;
#use DateTime;
use Time::Local;
require 'metadataGen.pl';

## END MODULE IMPORT

# Mandatory configuration scalars. Every name listed here must be assigned in
# the configfile (enforced by checkscalars). The names double as package
# variable names: assignscalars stores each value via a symbolic reference,
# which only works because this script runs without 'use strict'.
my @inputscalars = qw(mersea_cat
		      dataseturlpath
		      catalogname
		      topdatasetname
		      datasetname
		      datasetRootLocation
		      datasetRootPath
		      service_name
		      service_base
		      service_type
		      xmlfile
		      aggvars
		      geospatialCoverage); # New attribute - required
# Optional configuration scalars: used when present (mostly THREDDS metadata
# elements emitted by writeheader), silently left undef otherwise.
my @optinputscalars = qw(bulletindates
			 authority
			 dataType
			 dataFormat
			 doc_rights
			 doc_simple_href
			 doc_simple_title
			 doc_summary
			 creator_name
			 creator_contact_url
			 creator_contact_email
			 publisher_name
			 publisher_contact_url
			 publisher_contact_email
			 timeCoverage_end
			 timeCoverage_duration
			 timeCoverage_start); # these are the ones we use, there are more

# my ($headerflag,$footerflag,$appendflag,$template,$pipe);
# my ($mersea_cat,
#     $dataseturlpath,
#     $catalogname,
#     $topdatasetname,
#     $datasetname,
#     $datasetRootLocation,
#     $datasetRootPath,
#     $service_name,
#     $service_base,
#     $service_type,
#     $xmlfile);
# my ($bulletindates,
#     $authority,
#     $dataType,
#     $dataFormat,
#     $doc_rights,
#     $doc_simple_href,
#     $doc_simple_title,
#     $doc_summary,
#     $creator_name,
#     $creator_contact_url,
#     $creator_contact_email,
#     $publisher_name,
#     $publisher_contact_url,
#     $publisher_contact_email,
#     $timeCoverage_end,
#     $timeCoverage_duration,
#     $timeCoverage_start);
# my @ncvars;
# my $writer;
# my $fh;
# my $ncfile;
# my @lines;
# my $output;
# my $bd;
# my %bulletins;
# my ($forc_year,$forc_dom,$forc_month);
# my @bulletindates;
# my $element;
# my @befiles;
# my ($be,$prev,$abd,$i);
# my %forecast;
# my @files;
# my ($tmp,$var);
# my @listfiles;

## BEGIN SUBROUTINES

sub usage {

    # Print the command-line help text: synopsis, option summary, and the
    # lists of mandatory and optional configfile scalars. Output is written
    # to STDOUT; the caller decides whether to die afterwards.
    print "Usage:\n\t$0 [OPTIONS] configfile[.input.pl]\n",
	  "\t$0 [OPTIONS] - < configfile[.conf.pl]\n",
	  "\tcat configfile | $0 [OPTIONS] -\n",
	  "\n\tThe configfile must have all the scalars defined like this:\n",
	  "\t\$scalar=\"value\";\n";
    print "\n\tOPTIONS:\n",
	  "\t-h[eaderoutput]\t\t(default)Include the XML-headers\n\n",
	  "\t-noh[eaderoutput]\tDo not include the XML-headers\n\n",
	  "\t-f[ooteroutput]\t\t(default)Include the XML-footers\n\n",
	  "\t-nof[ooteroutput]\tDo not include the XML-footers\n\n",
	  "\t-a[ppend]\t\tAppend to file as opposed to overwrite\n\n",
	  "\t-noa[ppend]\t\t(default)Overwrite file as opposed to appending\n\n",
	  "\t-t[emplate]\t\tThe configfile will be interpreted as a templatefile\n",
	  "\t\t\t\twith possible ending .template.pl.\n\n",
	  "\t-\t\t\tRead from standard input\n";
    print "\n\tSCALARS - these scalars has to be defined in the configfile,\n";
    print map { "\t\t  \$$_ = \"VALUE(S)\";\n" } @inputscalars;
    print "\n\tSCALARS (optional) - these scalars may be defined in the configfile,\n";
    print map { "\t\t  \$$_ = \"VALUE(S)\";\n" } @optinputscalars;
}

sub init {

    # Parse command-line switches into package globals.
    # Header and footer output default to on; both options are negatable
    # ("!" spec), so -noh / -nof suppress them (used by the recursive
    # template invocation so only the outermost call writes the envelope).
    $headerflag = 1;
    $footerflag = 1;

    # The empty option name "" matches a lone "-" argument, meaning
    # "read the configfile from standard input" (sets $pipe).
    GetOptions("headeroutput!"=>\$headerflag,
	       "footeroutput!"=>\$footerflag,
	       "append!"=>\$appendflag,
	       "template"=>\$template,
	       ""=>\$pipe);
    #print "template=$template\n";
}

sub retrievefile {

    # Resolve a configfile name that may be given without its suffix, read
    # it, and return its lines stripped of comments and whitespace.
    # Suffix resolution: the literal name wins; otherwise ".input.pl" is
    # tried for normal runs and ".template.pl" when -t[emplate] was given.
    # Dies (after printing usage) when no candidate file exists.
    my $filename = shift;

    if ( -e $filename ) {
	# use the name exactly as given
    }elsif ( -e $filename.".input.pl" && !$template ) {

	$filename .= ".input.pl";
    }elsif ( -e $filename.".template.pl" && $template ) {

	$filename .= ".template.pl";
    }else{

	print "File not found: $filename\n\n";
	&usage();
	die "\nFile not found.\n";
    }

    # BUGFIX: the open was previously unchecked (and used indirect-object
    # syntax); a permission error surfaced later as a read from an undefined
    # handle. Fail loudly and immediately instead.
    my $cfg_fh = IO::File->new($filename, "r")
	or die "Cannot open $filename: $!\n";

    my @cfglines = <$cfg_fh>;
    $cfg_fh->close();

    return &remCommentsAndWS( @cfglines );
}

sub remCommentsAndWS {

    # Clean up raw configfile lines: erase "#" comments, drop lines that are
    # blank afterwards, strip trailing newlines, and finally trim surrounding
    # whitespace via removewhite(). Returns the cleaned list; arguments are
    # not modified (we work on a copy).
    my @raw = @_;
    my @kept;
    for my $line ( @raw ) {

	$line =~ s/#.*//;           # erase comments (NB!NB! also eats any $#array)
	next if $line =~ /^\s*$/;   # skip lines that are now blank
	chomp $line;                # remove trailing newline characters
	push @kept, $line;
    }
    return &removewhite(@kept);
}

sub assignscalars {

    # Scan configfile lines of the form  $name = "value"  and, for every
    # name appearing in the known scalar lists, store the value in the
    # package variable of the same name. The assignment goes through a
    # symbolic reference, which relies on the script not using 'strict refs'.
    my $known = join("|",@inputscalars)."|".join("|",@optinputscalars);
    for my $line ( @_ ) {

	next unless $line =~ /^\s*\$($known)\s*=\s*\"(.*)\"/;
	my ($name, $value) = ($1, $2);
	$$name = $value;
    }
}

sub checkscalars {

    # Verify that every mandatory scalar was assigned by assignscalars.
    # On the first missing one: print usage, dump the current value of every
    # mandatory and optional scalar (via symbolic deref), and abort.
    for my $required ( @inputscalars ) {

	next if defined($$required);

	&usage();
	print "\nYour file must define all neccesary variables/scalars:\n\n";
	print "\t$_ is set to $$_\n" for @inputscalars;
	print "\t(optional) $_ is set to $$_\n" for @optinputscalars;
	die "Exiting!";
    }
}

sub removewhite {

    # Trim leading and trailing whitespace from every argument.
    # Works on copies, so the caller's values are left untouched; returns
    # the trimmed list.
    my @trimmed = @_;
    for my $word (@trimmed) {

	$word =~ s/^\s+|\s+$//g;
    }
    return @trimmed;
}

sub writeheader {
    # Emit the catalog preamble: XML declaration, the opening <catalog> and
    # top-level <dataset> tags, the OpenDAP <service> elements, and an
    # inherited <metadata> block assembled from the optional config scalars.
    # writefooter() closes the tags opened here. Reads the package globals
    # set by assignscalars ($catalogname, $service_*, $datasetRoot*, the
    # doc_*/creator_*/publisher_* scalars) plus $writer and $mersea_cat.
    #KAL -- Added metadata retrieval subroutine, present in the metadata package
	require 'metadataGen.pl';

	#KAL -- Get "Expiry date for data set
	# NOTE(review): metadata::getexpiry lives in metadataGen.pl (not visible
	# here); presumably it returns a date string suitable for the catalog
	# 'expires' attribute -- confirm its format against the THREDDS spec.
	$expiry_date=metadata::getexpiry($mersea_cat);

    $writer->xmlDecl("UTF-8");
    $writer->startTag('catalog',
		      'name'         => "$catalogname",
		      'version'      => '1.0.1',
		      'xmlns'        => 'http://www.unidata.ucar.edu/namespaces/thredds/InvCatalog/v1.0',
		      'xmlns:xlink'  => 'http://www.w3.org/1999/xlink',
		      'xmlns:xsi'    => 'http://www.w3.org/2001/XMLSchema-instance',
		      'xsi:schemaLocation'=>'http://www.unidata.ucar.edu/namespaces/thredds/InvCatalog/v1.0  http://www.unidata.ucar.edu/schemas/thredds/InvCatalog.1.0.xsd',  
		      'expires'      => "$expiry_date");
    $writer->startTag('dataset', 
		      'name'         => "$topdatasetname");
    # A bare OpenDAP service named "this"; the inherited <serviceName> below
    # points at it, so child datasets resolve URLs relative to the catalog.
    $writer->emptyTag('service', 
		      'name'         => 'this',
		      'serviceType'  => 'OpenDAP',
		      'base'         => '');
    # NOTE(review): <datasetRoot> is nested inside this <service> element;
    # THREDDS usually expects datasetRoot as a direct catalog child -- this
    # apparently works on the deployed server, but verify when upgrading.
    $writer->startTag('service', 
		      'name'         => "$service_name",
		      'serviceType'  => "$service_type", #must be OPENDAP
		      'base'         => "$service_base");
    $writer->emptyTag('datasetRoot',
		      'path'         => "$datasetRootPath",
		      'location'     => "$datasetRootLocation");
    $writer->endTag('service');
    # Everything below is inherited metadata shared by all child datasets.
    $writer->startTag('metadata',
		      'inherited'    => 'true');

	#KAL -- Add aggregation level property tag for mersea - this depends on  (for now) the dataseturlpath
	&metadata::agglevelproperty('view','no',$writer,$dataseturlpath);

    $writer->startTag('serviceName');
    $writer->characters('this');
    $writer->endTag('serviceName');
    # Each optional scalar produces its THREDDS metadata element only when
    # the configfile defined it.
    if ( defined($authority) ) {
	$writer->startTag('authority');
	$writer->characters($authority);
	$writer->endTag('authority');
    }
    if ( defined($dataType) ) {
	$writer->startTag('dataType');
	$writer->characters($dataType);
	$writer->endTag('dataType');
    }
    if ( defined($dataFormat) ) {
	$writer->startTag('dataFormat');
	$writer->characters($dataFormat);
	$writer->endTag('dataFormat');
    }
    if ( defined($doc_rights) ) {
	$writer->startTag('documentation',
			  'type'      => 'rights');
	$writer->characters($doc_rights);
	$writer->endTag('documentation');
    }
    if ( defined($doc_simple_href) && defined($doc_simple_title) ) {
	$writer->emptyTag('documentation',
			  'xlink:href' => $doc_simple_href,
			  'xlink:title'=> $doc_simple_title);
    }
    if ( defined($doc_summary) ) {
	$writer->startTag('documentation',
			  'type' => 'summary');
	$writer->characters($doc_summary);
	$writer->endTag('documentation');
    }
    # <creator>: name plus contact; a single <contact> carries both url and
    # email when both exist, otherwise one element per defined attribute.
    if ( defined($creator_name) || defined($creator_contact_url) || defined($creator_contact_email) ) {
	$writer->startTag('creator');
	if ( defined($creator_name) ) {
	    $writer->startTag('name',
			      'vocabulary' => 'DIF');
	    $writer->characters($creator_name);
	    $writer->endTag('name');
	}
	if ( defined($creator_contact_url) && defined($creator_contact_email) ) {
	    $writer->emptyTag('contact',
			      'url' => $creator_contact_url,
			      'email' => $creator_contact_email);
	}else{
	    $writer->emptyTag('contact',
			      'email' => $creator_contact_email) if defined($creator_contact_email);
	    $writer->emptyTag('contact',
			      'url' => $creator_contact_url) if defined($creator_contact_url);
	}
	$writer->endTag('creator');
    }
    # <publisher>: same structure as <creator> above.
    if ( defined($publisher_name) || defined($publisher_contact_url) || defined($publisher_contact_email) ) {
	$writer->startTag('publisher');
	if ( defined($publisher_name) ) {
	    $writer->startTag('name',
			      'vocabulary' => 'DIF');
	    $writer->characters($publisher_name);
	    $writer->endTag('name');
	}
	if ( defined($publisher_contact_url) && defined($publisher_contact_email) ) {
	    $writer->emptyTag('contact',
			      'url' => $publisher_contact_url,
			      'email' => $publisher_contact_email);
	}else{
	    $writer->emptyTag('contact',
			      'email' => $publisher_contact_email) if defined($publisher_contact_email);
	    $writer->emptyTag('contact',
			      'url' => $publisher_contact_url) if defined($publisher_contact_url);
	}
	$writer->endTag('publisher');
    }


	#KAL - this is done by a different subroutine now 
	&metadata::main_metadatagen($mersea_cat,'no',$writer,$geospatialCoverage);

    $writer->endTag('metadata');
}

sub writefooter {

    # Close the <dataset> and <catalog> elements opened by writeheader and
    # flush/finalise the XML writer.
    $writer->endTag($_) for qw(dataset catalog);
    $writer->end();
}

sub initiatedataset {

    # Open one aggregated <dataset> element: metadata (aggregation-level
    # property + time coverage derived from @dates), serviceName, and the
    # NcML <netcdf>/<aggregation> skeleton. The caller then emits one
    # <netcdf location=... coordValue=...> per file and finishes with
    # enddataset(). Uses globals $writer, $datasetRootPath, $dataseturlpath,
    # $aggvars.
    # Parameters: dataset display name, dataset ID/urlPath suffix, and the
    # (sorted) file list used for the time-coverage metadata.
    # NOTE(review): the unpacked $name/$ID are never used below -- the code
    # reads $_[0]/$_[1] directly (equivalent, since @_ is not modified).
    my ( $name , $ID , @dates ) =@_;

    # 2 parameters; string datasetname, string dataseturlpath
    die "Internal error: initiatedataset missing input" if !defined($_[1]);
    $writer->startTag('dataset', 
		      'name'         => "$_[0]",
		      'ID'           => "$_[1]",
		      'urlPath'      => "$datasetRootPath/$_[1]");
    $writer->startTag('metadata',
	                  'inherited' => 'true');
	&metadata::agglevelproperty('view','no',$writer,$dataseturlpath);
    &metadata::metadata_timeCoverage($writer,'no',@dates);
    $writer->endTag('metadata');
    $writer->startTag('serviceName');
    $writer->characters('TOPAZ');
    $writer->endTag('serviceName');
    $writer->startTag('netcdf',
		      'xmlns'        => 'http://www.unidata.ucar.edu/namespaces/netcdf/ncml-2.2');
#     $writer->emptyTag('dimension',
# 		      'name'         => 'time',
# 		      'length'       => '0');
    # Declare the new time coordinate variable that the joinNew aggregation
    # will populate from the per-file coordValue attributes.
    # NOTE(review): the unit string "secs since ..." is non-standard udunits
    # (usually "seconds since ..."); confirm the TDS version accepts it.
    $writer->startTag('variable',
		      'name'         => 'time',
		      'type'         => 'int',
		      'shape'        => 'time');
    $writer->emptyTag('attribute',
		      'name'         => 'units',
		      'value'        => 'secs since 1970-01-01 00:00:00');
    $writer->emptyTag('attribute',
		      'name'         => '_CoordinateAxisType',
		      'value'        => 'time');
    $writer->endTag('variable');
    # joinNew is used (not joinExisting) because these files do not carry
    # time as a dimension/coordinate variable -- see 20090213 changelog note.
    $writer->startTag('aggregation', 
		      'dimName'      => 'time',
		      'type'         => 'joinNew');
#    $writer->startTag('aggregation', 
#		      'dimName'      => 'time',
#		      'type'         => 'joinExisting');
    #'recheckEvery' => '1 days'
    #'dateFormat'   => 'yyyy/M/d:hh:mm:ss z' );

    # insert variableAgg
    #$ncfile = `ls -1 $mersea_cat | head -1`;
    #$ncfile = glob( $mersea_cat );
    #chomp $ncfile;
    #@ncvars = `ncdump -h $ncfile | grep -v : | grep short | cut -c8- | sed 's/(.*//'`;
    # The variables to aggregate come from the $aggvars config scalar
    # (whitespace-separated), not from inspecting the files (commented out).
    @ncvars = (split " ", $aggvars);
    foreach (@ncvars) {
	chomp;
	$writer->emptyTag('variableAgg',
			  'name'         => "$_");
    }
}

sub enddataset {

    # Close the <aggregation>, <netcdf> and <dataset> elements opened by
    # initiatedataset, in nesting order.
    $writer->endTag($_) for qw(aggregation netcdf dataset);
}

sub finddate {
    # Extract the forecast date from a filename shaped like
    # ..._bYYYYMMDD_fYYYYMMDD9999.ext and return it as a
    # (year, month, day) string triple. Dies when the filename does not
    # yield exactly eight digits.
    my $fname = shift;
    die "Internal error: finddate missing input" if !$fname;

    (my $digits = $fname) =~ s/^.*_f//;   # keep what follows the last "_f"
    $digits =~ s/9999\..*$//;             # drop the "9999.<ext>" tail

    die "Filename not containing a date\n"
	if length($digits) != 8 || $digits !~ /^\d+$/; # 8 digits

    return ( substr($digits,0,4),         # year
	     substr($digits,4,2),         # month
	     substr($digits,6,2) );       # day of month
}

sub findbdate { # Returns bulletin date
    # Extract the bulletin date from a filename shaped like
    # ..._bYYYYMMDD_fYYYYMMDD9999.ext and return it as a
    # (year, month, day) string triple. Dies when the filename does not
    # yield exactly eight digits.
    my $fname = shift;
    die "Internal error: findbdate missing input" if !$fname;

    (my $digits = $fname) =~ s/^.*_b//;   # keep what follows the last "_b"
    $digits =~ s/_f.*$//;                 # drop the forecast-date tail
    #print "bull date: $digits \n";

    die "Filename not containing a date\n"
	if length($digits) != 8 || $digits !~ /^\d+$/; # 8 digits

    return ( substr($digits,0,4),         # year
	     substr($digits,4,2),         # month
	     substr($digits,6,2) );       # day of month
}




sub template {
    # Template mode: treat $mersea_cat as a directory and run this very
    # script once per subdirectory, piping each child a generated configfile
    # on STDIN ("-nof -noh -a -": no header/footer, append to $xmlfile).
    # This routine writes the header itself, lets the children append their
    # dataset sections, then appends the footer and exits the process.
    use File::Listing;
    use File::Basename;

    #KAL - if template, mersea_cat must be directory
	-d $mersea_cat or die "For templates mersea_cat must be a directory";

    $output = new IO::File "$xmlfile", "w"; # start new clean file
    $writer = XML::Writer->new(DATA_INDENT => 1 , DATA_MODE => 1 , OUTPUT => $output);
    &writeheader();
    $output->close;
#    for (parse_dir(`ls -l $mersea_cat`)) {
#	(my $dname, my $type) = @$_;
#	next if $type ne 'd'; # directory
#       print "$dname \n";
    for (glob "$mersea_cat" . "/*") {
       $_=basename $_;
	$dname = $_;
	next if not -d "$mersea_cat/$_" ; # skip if not directory

	# NOTE(review): $0 is interpolated into a shell pipe; a script path
	# containing shell metacharacters would break (or worse). Consider
	# list-form open if this ever runs outside controlled paths.
	my $recfh = new IO::File "| $0 -nof -noh -a -" or die "Error in running recursively";
	# Re-emit every mandatory scalar for the child, specialising the
	# directory-dependent ones for this subdirectory ($dname).
	foreach ( @inputscalars ) {
	    
	    if ( $_ eq 'mersea_cat' ) {
		$var = "$$_/$dname/*.nc";
	    }elsif ($_ eq 'dataseturlpath' ) {
		$var = "$$_-$dname";
	    }elsif ($_ eq 'datasetname' ) {
		$var = "$$_ $dname";
	    }elsif ($_ eq 'datasetRootLocation' ) {
		$var = "$datasetRootLocation$dname/"; #assumes we use append in recurs!
	    }else {
		$var = $$_;
	    }
	    $recfh->print("\$$_ = \"$var\";");
	}
	# Optional scalars are forwarded unchanged (undef becomes "").
	foreach ( @optinputscalars ) {
	    
	    $recfh->print("\$$_ = \"$$_\";");
	}
	$recfh->close;
    }
    $output = new IO::File "$xmlfile", "a"; # append footer
    $writer = XML::Writer->new(DATA_INDENT => 1 , DATA_MODE => 1 , OUTPUT => $output, UNSAFE => 1);
    &writefooter() if $footerflag;
    $output->close;

    print "Finished producing xml file: $xmlfile.\n" if $footerflag;
    exit;
}

sub checkprefix {

    (my $sb, my $file) = @_;
    my @prefix = split("::",$sb);
    if (  $appendflag && shift(@prefix) eq 'prefix' ) {

	return pop(@prefix)."/$file";
    }else{

	return $file;
    }
}

## End SUBROUTINES

## MAIN

# Parse command-line switches before reading any configuration.
&init();

## BEGIN READING INPUT

# The configfile arrives either on STDIN (a lone "-" argument sets $pipe),
# or as a filename argument; with neither, print usage and quit.
if ( $pipe ) {

    @lines = &remCommentsAndWS(<STDIN>);

}elsif ( @ARGV ) {
    
    @lines = &retrievefile(shift);
    # A second filename argument is not supported; warn and ignore it.
    print "So far $0 only takes one file at the time.\n" if shift;
}else { 
    
    &usage();
    die "Exiting!\n";
}

## END READING INPUT

# Re-split on ";" so each statement, even one spanning several input lines,
# becomes a single entry (this also removes newlines).
@lines = split(";",join("",@lines)); # removes newlines

# Now @lines has only entries on the form "$scalar=EXPR"

## BEGIN INITIALIZATIONS

&assignscalars( @lines ); # does the assignments from the input[file]
&checkscalars(); # checks if all the neccesary scalars are present
&template() if $template; # template mode recurses per subdirectory, then exits

# Interpret the optional $bulletindates config scalar: a comma-separated mix
# of "be" (best estimates), "abd" (all bulletin dates) and explicit dates.
# When unset, default to both best estimates and all bulletin dates.
if ( defined($bulletindates) ) {
    
    foreach ( split(",",$bulletindates) ) {

	if ( $_ =~ /^\s*be\s*$/ ) { 
	    
	    # has the user requested best estimates?
	    $be = 1; 
	}elsif ( $_ =~ /^\s*abd\s*$/ ) { 
	    
	    # has the user requested all bulletin dates?
	    $abd = 1; 
	}else { 

	    # add all requested bulletin dates
	    # Note: duplicates will be removed later
	    # BUGFIX: removewhite() returns a list; the old scalar assignment
	    # ($_ = &removewhite($_)) evaluated that array in scalar context
	    # and stored the element count (always 1) instead of the trimmed
	    # date. List assignment keeps the trimmed string itself.
	    ($_) = &removewhite($_);
	    push(@bulletindates,$_); 
	}
    }
}else{

    # No explicit request: best estimate plus every bulletin date.
    $be = 1;
    $abd = 1;
}

## END OF INITIALIZATIONS 

# find files matching the $mersea_cat glob pattern (e.g. "dir/*.nc")
@listfiles=bsd_glob("$mersea_cat");
if ( $#listfiles < 0 )  {

    print "No files found in $mersea_cat \n";
    die "I quit \n";
}

# Keep only the basenames; the directory part is re-added later through
# $datasetRootLocation when the <netcdf location=...> tags are written.
foreach (@listfiles) {

    $tmp=$_;
    $tmp=~s/.*\///; # remove path
    push @files,$tmp ;
}

# 1) Get bulletin and forecast dates 
for ($i=0; $i<=$#files; $i++) {
#    #Bulletin
#    my $tmp=$files[$i];
#    $tmp=~s/^.*_b//; # remove everything before bulletindate
#    $tmp=~s/_f.*$//; # remove everything after bulletindate
#    
#    #Forecast
#    my $tmp2=$files[$i]; 
#    $tmp2=~s/^.*_f//; # remove everything before forecastdate
#    $tmp2=~s/9999\..*$//; # remove everything after forecastdate
    # $tmp  = bulletin date "YYYYMMDD", $tmp2 = forecast date "YYYYMMDD"
    my $tmp  = join "" , findbdate($files[$i]) ;
    my $tmp2 = join "" , finddate ($files[$i]) ;

    
    # Create a "forecast" hash element (hash of hashes)
    # Each forecast date has an associated bulletin date (tmp) which points to
    # the filename
    if ( $be ) {
	$forecast{$tmp2}->{$tmp}=$files[$i];
    }

    # Create a "bulletin" hash element (hash of hashes)
    # Each bulletin date has an associated forecast date (tmp2) which points to
    # the filename
    # NOTE(review): this only fills %bulletins when the user listed explicit
    # dates; with "abd" alone, @bulletindates is still empty on the first
    # iteration, but the push below makes the condition true from the second
    # file onward -- so the first file's bulletin entry is skipped when abd
    # is combined with no explicit dates. Confirm whether this is intended.
    if ( $#bulletindates >= 0) {
	$bulletins{$tmp}->{$tmp2}=$files[$i];
    }

    # If we are supposed to list all bulletindates, add all to the list
    if ( $abd ) {
	push(@bulletindates,$tmp); 
    }
}


## START SORTING

# For each forecast date keep only the file issued by the newest bulletin --
# that newest bulletin is the "best estimate" for the date.
for my $fdate (keys %forecast) {
    my $newest_bulletin = max( keys %{ $forecast{$fdate} } );
    push @befiles, $forecast{$fdate}->{$newest_bulletin};
}
# Sort best estimate files
@befiles = sort @befiles;

# Sort the bulletin dates, then drop duplicates (a seen-hash removes them
# regardless of adjacency; the list is already sorted anyway).
@bulletindates = sort @bulletindates;
my %seen_bdate;
@bulletindates = grep { !$seen_bdate{$_}++ } @bulletindates;

## END SORTING

## START WRITING

# Write to XML file. In append mode (recursive template invocation) the
# writer runs UNSAFE so it does not emit a second XML declaration into an
# existing file; otherwise the file is created fresh.
# BUGFIX: both opens were previously unchecked (and used indirect-object
# syntax); a failed open left $output undefined and every subsequent write
# failed silently. Die with the OS error instead.
if ( $appendflag ) {

    $output = IO::File->new($xmlfile, "a")
	or die "Cannot open $xmlfile for appending: $!\n";
    $output->print("\n");
    $writer = XML::Writer->new(DATA_INDENT => 1 , DATA_MODE => 1 , OUTPUT => $output, UNSAFE => 1);
}else{
    
    $output = IO::File->new($xmlfile, "w")
	or die "Cannot open $xmlfile for writing: $!\n";
    $writer = XML::Writer->new(DATA_INDENT => 1 , DATA_MODE => 1 , OUTPUT => $output);
}

&writeheader() if $headerflag;

# Emit the "best estimate" aggregation: one <netcdf> entry per file, with
# the forecast date (as a Unix epoch) as the joinNew coordinate value.
if ( $be ) {
    
    &initiatedataset("Best estimate - $datasetname","$dataseturlpath-be",@befiles);

    # Go through each file 
    foreach $element ( @befiles ) {
	
	# find forecastdate
	($forc_year,$forc_month,$forc_dom) = finddate( $element );

	# BUGFIX: timelocal() interpreted the forecast date in the server's
	# local time zone, while every earlier implementation (`date -u`,
	# DateTime with time_zone => 'UTC') and the declared time unit
	# "secs since 1970-01-01 00:00:00" intend UTC. timegm() yields UTC
	# midnight regardless of the host time zone.
	my $secs = timegm(0,0,0,$forc_dom,$forc_month-1,$forc_year); # Month in range 0..11

	$writer->emptyTag('netcdf', 
			  'location' => "$datasetRootLocation$element",
			  'coordValue'    => "$secs");
    }
    &enddataset();
}

# Emit one aggregation per requested bulletin date, each containing that
# bulletin's forecast files ordered by forecast date.
if ( @bulletindates ) {

    # go through each requested bulletin date
    foreach $bd ( @bulletindates ) {

	# The sorted forecast-date keys double as the time-coverage input
	# for initiatedataset.
	&initiatedataset("Bulletin $bd - $datasetname", "$dataseturlpath-$bd", sort keys % { $bulletins{$bd}  });


	# Go through each file 
	foreach ( sort keys %{ $bulletins{$bd} } ) {


	    # find forecastdate
	    ($forc_year,$forc_month,$forc_dom) = finddate( $bulletins{$bd}->{$_} );

	    # BUGFIX: timelocal() interpreted the date in the server's local
	    # time zone; the earlier implementations (`date -u`, DateTime
	    # with UTC) and the declared epoch unit intend UTC, so use
	    # timegm() instead.
	    my $secs = timegm(0,0,0,$forc_dom,$forc_month-1,$forc_year); # Month in range 0..11

	    $writer->emptyTag('netcdf', 
			      'location'  => "$datasetRootLocation$bulletins{$bd}->{$_}",
			      'coordValue'     =>  "$secs" );
	    
	}
	&enddataset();
    }
}

&writefooter() if $footerflag; # close <dataset>/<catalog> unless -nof was given
$output->close();
print "Finished producing xml file: $xmlfile.\n" if $footerflag;
