diff --git a/examples/help/create-template/expected.txt b/examples/help/create-template/expected.txt index c3781aa2..7596f885 100644 --- a/examples/help/create-template/expected.txt +++ b/examples/help/create-template/expected.txt @@ -1,6 +1,9 @@ -usage: flatten-tool create-template [-h] -s SCHEMA [-f {csv,xlsx,all}] +usage: flatten-tool create-template [-h] [-s SCHEMA] [-f {csv,xlsx,all}] [-m MAIN_SHEET_NAME] [-o OUTPUT_NAME] [--rollup] [-r ROOT_ID] [--use-titles] + [--xml] + [--xml-schema [XML_SCHEMA [XML_SCHEMA ...]]] + [--root-list-path ROOT_LIST_PATH] optional arguments: -h, --help show this help message and exit @@ -22,3 +25,9 @@ optional arguments: -r ROOT_ID, --root-id ROOT_ID Root ID of the data format, e.g. ocid for OCDS --use-titles Convert titles. Requires a schema to be specified. + --xml Use XML as the input format + --xml-schema [XML_SCHEMA [XML_SCHEMA ...]] + Path to one or more XML schemas + --root-list-path ROOT_LIST_PATH + Path of the root list, defaults to main. Needed for + XML template creation only. diff --git a/flattentool/__init__.py b/flattentool/__init__.py index 33b92e7d..a9b62331 100644 --- a/flattentool/__init__.py +++ b/flattentool/__init__.py @@ -5,6 +5,7 @@ from flattentool.input import FORMATS as INPUT_FORMATS from flattentool.xml_output import toxml from flattentool.lib import parse_sheet_configuration +from flattentool.xml_create_template import XMLSchemaParser import sys import json import codecs @@ -12,8 +13,9 @@ from collections import OrderedDict -def create_template(schema, output_name='template', output_format='all', main_sheet_name='main', - rollup=False, root_id=None, use_titles=False, **_): +def create_template(schema=None, output_name='template', output_format='all', main_sheet_name='main', + rollup=False, root_id=None, use_titles=False, + xml=False, xml_schemas=None, root_list_path=None, **_): """ Creates template file(s) from given inputs This function is built to deal with commandline input and arguments @@ -21,7 +23,10 @@ def create_template(schema, output_name='template', output_format='all', main_sh """ - parser = SchemaParser(schema_filename=schema, rollup=rollup, root_id=root_id, use_titles=use_titles) + if xml: + parser = XMLSchemaParser(xml_schemas=xml_schemas, root_list_path=root_list_path) + else: + parser = SchemaParser(schema_filename=schema, rollup=rollup, root_id=root_id, use_titles=use_titles) parser.parse() def spreadsheet_output(spreadsheet_output_class, name): diff --git a/flattentool/cli.py b/flattentool/cli.py index b0ed71ad..379161b1 100644 --- a/flattentool/cli.py +++ b/flattentool/cli.py @@ -36,10 +36,10 @@ def create_parser(): parser_create_template = subparsers.add_parser( 'create-template', help='Create a template from the given schema') - parser_create_template.add_argument( + schema_group = parser_create_template.add_mutually_exclusive_group(required=True) + schema_group.add_argument( "-s", "--schema", - help="Path to the schema file you want to use to create the template", - required=True) + help="Path to the schema file you want to use to create the template") parser_create_template.add_argument( "-f", "--output-format", help="Type of template you want to create. Defaults to all available options", @@ -61,6 +61,19 @@ def create_parser(): "--use-titles", action='store_true', help="Convert titles. Requires a schema to be specified.") + parser_create_template.add_argument( + "--xml", + action='store_true', + help="Use XML as the input format") + schema_group.add_argument( + "--xml-schema", + dest='xml_schemas', + metavar='XML_SCHEMA', + nargs='*', + help="Path to one or more XML schemas") + parser_create_template.add_argument( + "--root-list-path", + help="Path of the root list, defaults to main. Needed for XML template creation only.") parser_flatten = subparsers.add_parser( 'flatten', diff --git a/flattentool/sort_xml.py b/flattentool/sort_xml.py index 5c9c06be..dfd92554 100644 --- a/flattentool/sort_xml.py +++ b/flattentool/sort_xml.py @@ -69,7 +69,7 @@ def get_schema_element(self, tag_name, name_attribute): return schema_element return schema_element - def element_loop(self, element, path): + def element_loop(self, element): """ Return information about the children of the supplied element. """ @@ -95,14 +95,12 @@ def element_loop(self, element, path): 'xsd:complexType/xsd:all/xsd:element', namespaces=namespaces) + type_elements) - child_tuples = [] for child in children: a = child.attrib if 'name' in a: - child_tuples.append((a['name'], child, None, a.get('minOccurs'), a.get('maxOccurs'))) + yield a['name'], child, None, a.get('minOccurs'), a.get('maxOccurs') else: - child_tuples.append((a['ref'], None, child, a.get('minOccurs'), a.get('maxOccurs'))) - return child_tuples + yield a['ref'], None, child, a.get('minOccurs'), a.get('maxOccurs') def create_schema_dict(self, parent_name, parent_element=None): """ @@ -114,7 +112,7 @@ def create_schema_dict(self, parent_name, parent_element=None): return OrderedDict([ (name, self.create_schema_dict(name, element)) - for name, element, _, _, _ in self.element_loop(parent_element, '')]) + for name, element, _, _, _ in self.element_loop(parent_element)]) def sort_element(element, schema_subdict): diff --git a/flattentool/xml_create_template.py b/flattentool/xml_create_template.py new file mode 100644 index 00000000..081e180e --- /dev/null +++ b/flattentool/xml_create_template.py @@ -0,0 +1,109 @@ +import sys + +from .sort_xml import XMLSchemaWalker, namespaces +from .sheet import Sheet + + +class XMLSchemaWalkerForTemplate(XMLSchemaWalker): + def attribute_loop(self, element): + """ + Returns a list containing a tuple for each attribute the given element + can have. + The format of the tuple is (name, is_required) + """ + #if element.find("xsd:complexType[@mixed='true']", namespaces=namespaces) is not None: + # print_column_info('text', indent) + + a = element.attrib + type_attributes = [] + type_attributeGroups = [] + if 'type' in a: + complexType = self.get_schema_element('complexType', a['type']) + if complexType is not None: + type_attributes = ( + complexType.findall('xsd:attribute', namespaces=namespaces) + + complexType.findall('xsd:simpleContent/xsd:extension/xsd:attribute', namespaces=namespaces) + ) + type_attributeGroups = ( + complexType.findall('xsd:attributeGroup', namespaces=namespaces) + + complexType.findall('xsd:simpleContent/xsd:extension/xsd:attributeGroup', namespaces=namespaces) + ) + + group_attributes = [] + for attributeGroup in ( + element.findall('xsd:complexType/xsd:attributeGroup', namespaces=namespaces) + + element.findall('xsd:complexType/xsd:simpleContent/xsd:extension/xsd:attributeGroup', namespaces=namespaces) + + type_attributeGroups + ): + group_attributes += self.get_schema_element('attributeGroup', attributeGroup.attrib['ref']).findall('xsd:attribute', namespaces=namespaces) + + for attribute in ( + element.findall('xsd:complexType/xsd:attribute', namespaces=namespaces) + + element.findall('xsd:complexType/xsd:simpleContent/xsd:extension/xsd:attribute', namespaces=namespaces) + + type_attributes + group_attributes + ): + doc = attribute.find(".//xsd:documentation", namespaces=namespaces) + if 'ref' in attribute.attrib: + referenced_attribute = self.get_schema_element('attribute', attribute.get('ref')) + if referenced_attribute is not None: + attribute = referenced_attribute + if doc is None: + # Only fetch the documentation of the referenced definition + # if we don't already have documentation. + doc = attribute.find(".//xsd:documentation", namespaces=namespaces) + yield attribute.get('name') or attribute.get('ref'), attribute.get('use') == 'required' + + def has_simple_content(self, element): + a = element.attrib + simple_content = False + # we look up the type, and that has a simpleContent child + if 'type' in a: + complexType = self.get_schema_element('complexType', a['type']) + if complexType is not None: + simple_content = bool(complexType.findall('xsd:simpleContent', namespaces=namespaces)) + # or the compleType element here has a simpleContent child + simple_content = simple_content or bool(element.findall('xsd:complexType/xsd:simpleContent', namespaces=namespaces)) + # or there is only an annotation element + simple_content = simple_content or [child.tag for child in element] == ['{http://www.w3.org/2001/XMLSchema}annotation'] + return simple_content + + def generate_paths(self, parent_name, parent_element=None, parent_path=''): + if parent_element is None: + parent_element = self.get_schema_element('element', parent_name) + + for name, required, in self.attribute_loop(parent_element): + if name == 'xml:lang': + # Namespaces not supported yet https://github.com/OpenDataServices/flatten-tool/issues/148 + # And no way to specify two narrative elements anyway https://github.com/OpenDataServices/cove/issues/777 + continue + yield parent_path + '@' + name + + for name, element, _, minOccurs, maxOccurs in self.element_loop(parent_element): + if element is None: + element = self.get_schema_element('element', name) + path = parent_path + name + if self.has_simple_content(element): + yield path + if maxOccurs == 'unbounded' or int(maxOccurs) > 1: + path += '/0/' + else: + path += '/' + for child_path in self.generate_paths(name, element, path): + yield child_path + + +class XMLSchemaParser(object): + """Parse the fields of a JSON schema into a flattened structure.""" + + def __init__(self, xml_schemas=[], root_list_path=None): + self.sub_sheets = {} + self.main_sheet = Sheet() + self.sub_sheet_mapping = {} + self.xml_schemas = xml_schemas + assert root_list_path is not None + self.root_list_path = root_list_path + + def parse(self): + for path in XMLSchemaWalkerForTemplate(self.xml_schemas).generate_paths(self.root_list_path): + self.main_sheet.append(path) +