Add limited XML create-template support

Bjwebb · Bjwebb · commit 56880b8088dd · 2018-08-14T15:06:15.000+01:00
OpenDataServices/cove#775 Based on this unmerged commit to CoVE OpenDataServices/cove@e274142
diff --git a/examples/help/create-template/expected.txt b/examples/help/create-template/expected.txt
@@ -1,6 +1,9 @@
-usage: flatten-tool create-template [-h] -s SCHEMA [-f {csv,xlsx,all}]
+usage: flatten-tool create-template [-h] [-s SCHEMA] [-f {csv,xlsx,all}]
                                     [-m MAIN_SHEET_NAME] [-o OUTPUT_NAME]
                                     [--rollup] [-r ROOT_ID] [--use-titles]
+                                    [--xml]
+                                    [--xml-schema [XML_SCHEMA [XML_SCHEMA ...]]]
+                                    [--root-list-path ROOT_LIST_PATH]
 
 optional arguments:
   -h, --help            show this help message and exit
@@ -22,3 +25,9 @@ optional arguments:
   -r ROOT_ID, --root-id ROOT_ID
                         Root ID of the data format, e.g. ocid for OCDS
   --use-titles          Convert titles. Requires a schema to be specified.
+  --xml                 Use XML as the input format
+  --xml-schema [XML_SCHEMA [XML_SCHEMA ...]]
+                        Path to one or more XML schemas
+  --root-list-path ROOT_LIST_PATH
+                        Path of the root list, defaults to main. Needed for
+                        XML template creation only.
diff --git a/flattentool/__init__.py b/flattentool/__init__.py
@@ -5,23 +5,28 @@
 from flattentool.input import FORMATS as INPUT_FORMATS
 from flattentool.xml_output import toxml
 from flattentool.lib import parse_sheet_configuration
+from flattentool.xml_create_template import XMLSchemaParser
 import sys
 import json
 import codecs
 from decimal import Decimal
 from collections import OrderedDict
 
 
-def create_template(schema, output_name='template', output_format='all', main_sheet_name='main',
-                    rollup=False, root_id=None, use_titles=False, **_):
+def create_template(schema=None, output_name='template', output_format='all', main_sheet_name='main',
+                    rollup=False, root_id=None, use_titles=False,
+                    xml=False, xml_schemas=None, root_list_path=None, **_):
     """
     Creates template file(s) from given inputs
     This function is built to deal with commandline input and arguments
     but to also be called from elswhere in future
 
     """
 
-    parser = SchemaParser(schema_filename=schema, rollup=rollup, root_id=root_id, use_titles=use_titles)
+    if xml:
+        parser = XMLSchemaParser(xml_schemas=xml_schemas, root_list_path=root_list_path)
+    else: 
+        parser = SchemaParser(schema_filename=schema, rollup=rollup, root_id=root_id, use_titles=use_titles)
     parser.parse()
 
     def spreadsheet_output(spreadsheet_output_class, name):
diff --git a/flattentool/cli.py b/flattentool/cli.py
@@ -36,10 +36,10 @@ def create_parser():
     parser_create_template = subparsers.add_parser(
         'create-template',
         help='Create a template from the given schema')
-    parser_create_template.add_argument(
+    schema_group = parser_create_template.add_mutually_exclusive_group(required=True)
+    schema_group.add_argument(
         "-s", "--schema",
-        help="Path to the schema file you want to use to create the template",
-        required=True)
+        help="Path to the schema file you want to use to create the template")
     parser_create_template.add_argument(
         "-f", "--output-format",
         help="Type of template you want to create. Defaults to all available options",
@@ -61,6 +61,19 @@ def create_parser():
         "--use-titles",
         action='store_true',
         help="Convert titles. Requires a schema to be specified.")
+    parser_create_template.add_argument(
+        "--xml",
+        action='store_true',
+        help="Use XML as the input format")
+    schema_group.add_argument(
+        "--xml-schema",
+        dest='xml_schemas',
+        metavar='XML_SCHEMA',
+        nargs='*',
+        help="Path to one or more XML schemas")
+    parser_create_template.add_argument(
+        "--root-list-path",
+        help="Path of the root list, defaults to main. Needed for XML template creation only.")
 
     parser_flatten = subparsers.add_parser(
         'flatten',
diff --git a/flattentool/sort_xml.py b/flattentool/sort_xml.py
@@ -69,7 +69,7 @@ def get_schema_element(self, tag_name, name_attribute):
                 return schema_element
         return schema_element
 
-    def element_loop(self, element, path):
+    def element_loop(self, element):
         """
         Return information about the children of the supplied element.
         """
@@ -95,14 +95,12 @@ def element_loop(self, element, path):
                 'xsd:complexType/xsd:all/xsd:element',
                 namespaces=namespaces)
             + type_elements)
-        child_tuples = []
         for child in children:
             a = child.attrib
             if 'name' in a:
-                child_tuples.append((a['name'], child, None, a.get('minOccurs'), a.get('maxOccurs')))
+                yield a['name'], child, None, a.get('minOccurs'), a.get('maxOccurs')
             else:
-                child_tuples.append((a['ref'], None, child, a.get('minOccurs'), a.get('maxOccurs')))
-        return child_tuples
+                yield a['ref'], None, child, a.get('minOccurs'), a.get('maxOccurs')
 
     def create_schema_dict(self, parent_name, parent_element=None):
         """
@@ -114,7 +112,7 @@ def create_schema_dict(self, parent_name, parent_element=None):
 
         return OrderedDict([
             (name, self.create_schema_dict(name, element))
-            for name, element, _, _, _ in self.element_loop(parent_element, '')])
+            for name, element, _, _, _ in self.element_loop(parent_element)])
 
 
 def sort_element(element, schema_subdict):
diff --git a/flattentool/xml_create_template.py b/flattentool/xml_create_template.py
@@ -0,0 +1,103 @@
+import sys
+
+from .sort_xml import XMLSchemaWalker, namespaces
+from .sheet import Sheet
+
+
+class XMLSchemaWalkerForTemplate(XMLSchemaWalker):
+    def attribute_loop(self, element):
+        """
+        Yields a tuple for each attribute the given element
+        can have.
+        The format of the tuple is (name, is_required)
+        """
+        #if element.find("xsd:complexType[@mixed='true']", namespaces=namespaces) is not None:
+        #    print_column_info('text', indent)
+            
+        a = element.attrib
+        type_attributes = []
+        type_attributeGroups = []
+        if 'type' in a:
+            complexType = self.get_schema_element('complexType', a['type'])
+            if complexType is not None:
+                type_attributes = (
+                    complexType.findall('xsd:attribute', namespaces=namespaces) +
+                    complexType.findall('xsd:simpleContent/xsd:extension/xsd:attribute', namespaces=namespaces)
+                    )
+                type_attributeGroups = (
+                    complexType.findall('xsd:attributeGroup', namespaces=namespaces) +
+                    complexType.findall('xsd:simpleContent/xsd:extension/xsd:attributeGroup', namespaces=namespaces)
+                    )
+
+        group_attributes = []
+        for attributeGroup in (
+                element.findall('xsd:complexType/xsd:attributeGroup', namespaces=namespaces) +
+                element.findall('xsd:complexType/xsd:simpleContent/xsd:extension/xsd:attributeGroup', namespaces=namespaces) +
+                type_attributeGroups
+                ):
+            group_attributes += self.get_schema_element('attributeGroup', attributeGroup.attrib['ref']).findall('xsd:attribute', namespaces=namespaces)
+
+        for attribute in (
+                element.findall('xsd:complexType/xsd:attribute', namespaces=namespaces) +
+                element.findall('xsd:complexType/xsd:simpleContent/xsd:extension/xsd:attribute', namespaces=namespaces) +
+                type_attributes + group_attributes
+                ):
+            doc = attribute.find(".//xsd:documentation", namespaces=namespaces)
+            if 'ref' in attribute.attrib:
+                referenced_attribute = self.get_schema_element('attribute', attribute.get('ref'))
+                if referenced_attribute is not None:
+                    attribute = referenced_attribute
+                if doc is None:
+                    # Only fetch the documentation of the referenced definition
+                    # if we don't already have documentation.
+                    doc = attribute.find(".//xsd:documentation", namespaces=namespaces)
+            yield attribute.get('name') or attribute.get('ref'), attribute.get('use') == 'required'
+
+    def has_simple_content(self, element):
+        a = element.attrib
+        simple_content = False
+        if 'type' in a:
+            complexType = self.get_schema_element('complexType', a['type'])
+            if complexType is not None:
+                simple_content = bool(complexType.findall('xsd:simpleContent', namespaces=namespaces))
+        return simple_content or bool(element.findall('xsd:complexType/xsd:simpleContent', namespaces=namespaces))
+
+    def generate_paths(self, parent_name, parent_element=None, parent_path=''):
+        if parent_element is None:
+            parent_element = self.get_schema_element('element', parent_name)
+
+        for name, required, in self.attribute_loop(parent_element):
+            if name == 'xml:lang':
+                # Namespaces not supported yet https://github.com/OpenDataServices/flatten-tool/issues/148
+                # And no way to specify two narrative elements anyway https://github.com/OpenDataServices/cove/issues/777
+                continue
+            yield parent_path + '@' + name
+
+        for name, element, _, minOccurs, maxOccurs in self.element_loop(parent_element):
+            if element is None:
+                element = self.get_schema_element('element', name)
+            path = parent_path + name
+            if self.has_simple_content(element):
+                yield path
+            if maxOccurs == 'unbounded' or int(maxOccurs) > 1:
+                path += '/0/'
+            else:
+                path += '/'
+            yield from list(self.generate_paths(name, element, path))
+
+
+class XMLSchemaParser(object):
+    """Parse the fields of an XML schema into a flattened structure."""
+
+    def __init__(self, xml_schemas=[], root_list_path=None):
+        self.sub_sheets = {}
+        self.main_sheet = Sheet()
+        self.sub_sheet_mapping = {}
+        self.xml_schemas = xml_schemas
+        assert root_list_path is not None
+        self.root_list_path = root_list_path
+
+    def parse(self):
+        for path in XMLSchemaWalkerForTemplate(self.xml_schemas).generate_paths(self.root_list_path):
+            self.main_sheet.append(path)
+