
Commit

simplify shell scripts and update USGS Level 1 to accommodate .gz and MTL.txt at the same time
tina5870 committed Jun 20, 2018
1 parent a2b9ea6 commit 0002941
Showing 6 changed files with 68 additions and 16 deletions.
4 changes: 3 additions & 1 deletion do_qsub_l1.sh
@@ -1,2 +1,4 @@
-for i in `find /g/data/v10/AGDCv2/indexed_datasets/ledaps_lasrc/opendatacubepipelines.ledapslasrc/qsub_scripts_l1 -name "*.qsub"`; do qsub $i; done
+HOME=/g/data2/v10/AGDCv2/datacube-ingestion/indexed-products/ledaps_lasrc/opendatacubepipelines.ledapslasrc
+
+for i in `find $HOME/qsub_scripts_l1 -name "*.qsub"`; do qsub $i; done
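A note on the rewritten do_qsub_l1.sh (the same pattern recurs in the scripts below): HOME is the shell's own home-directory variable, so reassigning it changes what ~ and $HOME mean for everything run afterwards in the same shell; a differently named variable is safer. The backtick-for loop also word-splits on unusual filenames. A minimal sketch, assuming the same directory layout (BASE is a hypothetical name standing in for the reassigned HOME):

    #!/bin/bash
    # BASE is a hypothetical stand-in for the commit's reassigned HOME.
    BASE=/g/data2/v10/AGDCv2/datacube-ingestion/indexed-products/ledaps_lasrc/opendatacubepipelines.ledapslasrc

    # Null-delimited find keeps filenames with spaces or newlines intact.
    find "$BASE/qsub_scripts_l1" -name '*.qsub' -print0 |
    while IFS= read -r -d '' script; do
        qsub "$script"
    done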

7 changes: 5 additions & 2 deletions ledaps_lasrc_pbs_tasker.sh
@@ -1,2 +1,5 @@
-for i in `ls -1 /g/data/v10/projects/ARD_interoperability/L2/unzip`; do if [[ $i != *":"* ]]; then mkdir -p /g/data/v10/AGDCv2/indexed_datasets/ledaps_lasrc/opendatacubepipelines.ledapslasrc/yamls/$i ; fi; done
-for i in `ls /g/data/v10/AGDCv2/indexed_datasets/ledaps_lasrc/opendatacubepipelines.ledapslasrc/yamls/`; do cp ledaps_lasrc_prepare.sh /g/data/v10/AGDCv2/indexed_datasets/ledaps_lasrc/opendatacubepipelines.ledapslasrc/qsub_scripts/$i.qsub; sed -i -e "s/TARGET/$i/g" "/g/data/v10/AGDCv2/indexed_datasets/ledaps_lasrc/opendatacubepipelines.ledapslasrc/qsub_scripts/$i.qsub"; echo 'qsub /g/data/v10/AGDCv2/indexed_datasets/ledaps_lasrc/opendatacubepipelines.ledapslasrc/qsub_scripts/'$i'.qsub'; done
+HOME=/g/data/v10/AGDCv2/indexed_datasets/ledaps_lasrc/opendatacubepipelines.ledapslasrc
+DATA=/g/data/dz56/ARD_interoperability/L2
+
+for i in `ls -1 $DATA/unzip`; do if [[ $i != *":"* ]]; then mkdir -p $DATA/yamls/$i ; fi; done
+for i in `ls $DATA/yamls/`; do cp ledaps_lasrc_prepare.sh $HOME/qsub_scripts/$i.qsub; sed -i -e "s/TARGET/$i/g" "$HOME/qsub_scripts/$i.qsub"; echo 'qsub $HOME/qsub_scripts/'$i'.qsub'; done
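A quoting caveat in the new tasker line (also present in ledaps_lasrc_pbs_tasker_l1.sh below): in echo 'qsub $HOME/qsub_scripts/'$i'.qsub' the single quotes keep $HOME literal, so the echoed command prints a verbatim "$HOME" and cannot be pasted back into a shell. A corrected sketch of the second loop, assuming the HOME and DATA values above:

    # Double quotes let both variables expand; the committed line prints a literal $HOME.
    for i in `ls $DATA/yamls/`; do
        cp ledaps_lasrc_prepare.sh "$HOME/qsub_scripts/$i.qsub"
        sed -i -e "s/TARGET/$i/g" "$HOME/qsub_scripts/$i.qsub"
        echo "qsub $HOME/qsub_scripts/$i.qsub"
    done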
9 changes: 7 additions & 2 deletions ledaps_lasrc_pbs_tasker_l1.sh
@@ -1,2 +1,7 @@
-for i in `ls -1 /g/data/v10/projects/ARD_interoperability/L1`; do if [[ $i != *":"* ]]; then mkdir -p /g/data/v10/AGDCv2/indexed_datasets/ledaps_lasrc/opendatacubepipelines.ledapslasrc/yamls_l1/$i ; fi; done
-for i in `ls /g/data/v10/AGDCv2/indexed_datasets/ledaps_lasrc/opendatacubepipelines.ledapslasrc/yamls_l1/`; do cp ledaps_lasrc_prepare_l1.sh /g/data/v10/AGDCv2/indexed_datasets/ledaps_lasrc/opendatacubepipelines.ledapslasrc/qsub_scripts_l1/$i.qsub; sed -i -e "s/TARGET/$i/g" "/g/data/v10/AGDCv2/indexed_datasets/ledaps_lasrc/opendatacubepipelines.ledapslasrc/qsub_scripts_l1/$i.qsub"; echo 'qsub /g/data/v10/AGDCv2/indexed_datasets/ledaps_lasrc/opendatacubepipelines.ledapslasrc/qsub_scripts_l1/'$i'.qsub'; done
+#HOME=/g/data1b/da82/AODH/USGS/L1/Landsat/C1
+#DATA=/g/data1b/da82/AODH/USGS/L1/Landsat/C1
+HOME=/g/data2/v10/AGDCv2/datacube-ingestion/indexed-products/ledaps_lasrc/opendatacubepipelines.ledapslasrc
+DATA=/g/data2/v10/AGDCv2/datacube-ingestion/indexed-products/ledaps_lasrc/opendatacubepipelines.ledapslasrc/test_data
+
+for i in `ls -1 $DATA`; do if [[ $i != *":"* ]]; then mkdir -p $HOME/yamls_test/$i ; fi; done
+for i in `ls $HOME/yamls_test/`; do cp ledaps_lasrc_prepare_l1.sh $HOME/qsub_scripts_l1/$i.qsub; sed -i -e "s/TARGET/$i/g" "$HOME/qsub_scripts_l1/$i.qsub"; echo 'qsub $HOME/qsub_scripts_l1/'$i'.qsub'; done
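The two passes here (mkdir from $DATA, then qsub generation from $HOME/yamls_test) can collapse into one loop. A sketch with the quoting fixed, assuming each entry under $DATA is a dataset directory; note the committed second loop also picks up yamls_test entries left over from earlier runs, while this version only covers what is currently under $DATA:

    for d in "$DATA"/*/; do
        i=$(basename "$d")
        [[ $i == *:* ]] && continue        # keep the original skip of names containing ':'
        mkdir -p "$HOME/yamls_test/$i"
        cp ledaps_lasrc_prepare_l1.sh "$HOME/qsub_scripts_l1/$i.qsub"
        sed -i -e "s/TARGET/$i/g" "$HOME/qsub_scripts_l1/$i.qsub"
        echo "qsub $HOME/qsub_scripts_l1/$i.qsub"
    done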
4 changes: 2 additions & 2 deletions ledaps_lasrc_prepare.sh
@@ -13,6 +13,6 @@ module load gaip/dev-sen2redo
 module load parallel
 
 HOME=/g/data/v10/AGDCv2/indexed_datasets/ledaps_lasrc/opendatacubepipelines.ledapslasrc
-DATA=/g/data/v10/projects/ARD_interoperability/L2
+DATA=/g/data/dz56/ARD_interoperability/L2
 
-find $DATA/unzip/TARGET -name *.xml | parallel --jobs 16 "python $HOME/ls_usgs_l2_prepare.py {} --output $HOME/yamls/TARGET --no-checksum --date 1/1/1999"
+find $DATA/unzip/TARGET -name *.xml | parallel --jobs 16 "python $HOME/ls_usgs_l2_prepare.py {} --output $DATA/yamls/TARGET --no-checksum --date 1/1/1999"
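One fragility in both prepare scripts: the unquoted -name *.xml is expanded by the shell before find ever runs if a matching file happens to sit in the current directory. Quoting the pattern avoids the surprise; same pipeline otherwise:

    # The quoted pattern reaches find intact instead of being globbed by the shell.
    find "$DATA/unzip/TARGET" -name '*.xml' | parallel --jobs 16 "python $HOME/ls_usgs_l2_prepare.py {} --output $DATA/yamls/TARGET --no-checksum --date 1/1/1999"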
6 changes: 3 additions & 3 deletions ledaps_lasrc_prepare_l1.sh
@@ -12,7 +12,7 @@ module load gaip/dev-sen2redo
 #module load agdc-py3-prod/1.5.1
 module load parallel
 
-HOME=/g/data/v10/AGDCv2/indexed_datasets/ledaps_lasrc/opendatacubepipelines.ledapslasrc
-DATA=/g/data/v10/projects/ARD_interoperability/L1
+HOME=/g/data2/v10/AGDCv2/datacube-ingestion/indexed-products/ledaps_lasrc/opendatacubepipelines.ledapslasrc
+DATA=/g/data2/v10/AGDCv2/datacube-ingestion/indexed-products/ledaps_lasrc/opendatacubepipelines.ledapslasrc/test_data
 
-find $DATA/TARGET -name *_MTL.txt | parallel --jobs 16 "python $HOME/ls_usgs_l1_prepare.py {} --output $HOME/yamls_l1/TARGET --no-checksum --date 1/1/1999"
+find $DATA/TARGET -name *_MTL.txt -o -name *.gz | parallel --jobs 16 "python $HOME/ls_usgs_l1_prepare.py {} --output $HOME/yamls_test/TARGET --no-checksum --date 1/1/1999"
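The new -o alternation works only because find's implicit -print applies to the whole expression; append an explicit action such as -exec later and it binds to the second -name alone. Explicit grouping plus quoted patterns keeps the intent stable — a sketch:

    # Parentheses group the alternation; quotes keep the shell out of the patterns.
    find "$DATA/TARGET" \( -name '*_MTL.txt' -o -name '*.gz' \) | parallel --jobs 16 "python $HOME/ls_usgs_l1_prepare.py {} --output $HOME/yamls_test/TARGET --no-checksum --date 1/1/1999"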
54 changes: 48 additions & 6 deletions ls_usgs_l1_prepare.py
@@ -15,6 +15,8 @@
 from datetime import datetime
 from os.path import join as pjoin
 import hashlib
+import tarfile
+import glob
 
 images1 = [('1', 'coastal_aerosol'),
            ('2', 'blue'),
@@ -214,6 +216,29 @@ def absolutify_paths(doc, path):
     return doc
 
 
+def find_gz_mtl(ds_path, output_folder):
+    """
+    Find the MTL metadata file inside an archived (tar.gz) dataset, extract
+    it, and store it temporarily in the output folder.
+    :param ds_path: the dataset path
+    :param output_folder: the output folder
+    :returns: full path to the extracted MTL file
+    """
+
+    mtl_path = ''
+
+    reT = re.compile("MTL.txt")
+    tar_gz = tarfile.open(str(ds_path), 'r')
+    members = [m for m in tar_gz.getmembers() if reT.search(m.name)]
+    tar_gz.extractall(output_folder, members)
+    mtl_path = pjoin(output_folder, members[0].name)
+
+    return mtl_path
+
+
 @click.command(help="""\b
           Prepare USGS Landsat Collection 1 data for ingestion into the Data Cube.
          This prepare script supports only for MTL.txt metadata file
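Two caveats in find_gz_mtl as committed: the tarfile handle is never closed, and when no member matches MTL.txt, members[0] raises an IndexError before an empty mtl_path can be returned, so the mtl_path == '' guard in main() below can never fire. (Separately, the click help text above still says the script supports only MTL.txt metadata files, which this commit makes stale.) A hedged rewrite keeping the same interface and imports:

    def find_gz_mtl(ds_path, output_folder):
        """Extract the MTL member of a tar.gz dataset into output_folder.

        Returns the extracted path, or '' when the archive holds no MTL file,
        so the caller's empty-string check can actually trigger.
        """
        with tarfile.open(str(ds_path), 'r') as tar_gz:  # context manager closes the archive
            members = [m for m in tar_gz.getmembers() if 'MTL.txt' in m.name]
            if not members:
                return ''
            tar_gz.extractall(output_folder, members)
        return pjoin(output_folder, members[0].name)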
@@ -239,8 +264,8 @@ def absolutify_paths(doc, path):
 
 def main(output, datasets, checksum, date):
     logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
-                    level=logging.INFO)
-
+                        level=logging.INFO)
     for ds in datasets:
         (mode, ino, dev, nlink, uid, gid, size, atime, mtime, ctime) = os.stat(ds)
         create_date = datetime.utcfromtimestamp(ctime)
@@ -249,12 +274,20 @@ def main(output, datasets, checksum, date):
                   " is older than start date ", date, "...SKIPPING")
         else:
             ds_path = Path(ds)
-            if ds_path.suffix in ('MTL.txt'):
-                mtl_path = str(ds_path)
-                ds_path = os.path.dirname(str(ds_path))
+            #if ds_path.suffix in ('.gz', 'MTL.txt'):
+            if ds_path.suffix in ('.gz', '.txt'):
+                if ds_path.suffix != '.txt':
+                    mtl_path = find_gz_mtl(ds_path, output)
+                    if mtl_path == '':
+                        raise RuntimeError('no MTL file under the product folder')
+                else:
+                    mtl_path = str(ds_path)
+
+                ds_path = os.path.dirname(str(ds_path))
+
+            #print (mtl_path)
             logging.info("Processing %s", ds_path)
-            output_yaml = pjoin(output, '{}.yaml'.format(os.path.basename(ds_path)))
+            output_yaml = pjoin(output, '{}.yaml'.format(os.path.basename(mtl_path).replace('_MTL.txt', '')))
             logging.info("Output %s", output_yaml)
             if os.path.exists(output_yaml):
                 logging.info("Output already exists %s", output_yaml)
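A subtlety in the replaced condition: ('MTL.txt') is a parenthesised string, not a one-element tuple, so the old test was substring membership; it passed only because '.txt' happens to be a substring of 'MTL.txt'. The new two-element tuple makes the membership test explicit. A quick illustration (the scene name is just an example):

    from pathlib import Path

    p = Path('LC08_L1TP_090084_20180101_20180104_01_T1_MTL.txt')  # example filename
    p.suffix in ('MTL.txt')       # True, but via '.txt' being a substring of 'MTL.txt'
    p.suffix in ('.gz', '.txt')   # True via genuine tuple membership - the corrected test

Note also that when a dataset path has neither suffix, mtl_path is never assigned on that iteration, so the output_yaml line consumes a value left over from a previous dataset (or raises NameError on the first).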
@@ -275,5 +308,14 @@ def main(output, datasets, checksum, date):
             with open(output_yaml, 'w') as stream:
                 yaml.dump(docs, stream)
 
+    #delete intermediate MTL files for archived datasets in the output folder
+    mtl_list = glob.glob('{}/*MTL.txt'.format(output))
+    if len(mtl_list) > 0:
+        for f in mtl_list:
+            try:
+                os.remove(f)
+            except OSError:
+                pass
+
 
 if __name__ == "__main__":
     main()
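(The len(mtl_list) > 0 guard above is redundant: iterating an empty list is already a no-op.) Since each extracted MTL is only needed until its YAML is written, an alternative to the end-of-run glob-and-delete is extracting into a self-cleaning temporary directory. A sketch of the idea, not what the commit does; it would replace the find_gz_mtl call inside the loop:

    import tempfile

    # Hypothetical variant: extract the archive's MTL into a throwaway directory.
    with tempfile.TemporaryDirectory() as tmp:
        mtl_path = find_gz_mtl(ds_path, tmp)
        # ... read the MTL and write the output YAML while tmp still exists ...
    # tmp and the extracted MTL vanish here; no glob cleanup required.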
