refactor: collection units

regisb · regisb · commit a9005d261e28 · 2023-02-06T20:11:14.000+01:00
Not every unit should have children. It's much simpler and cleaner if
only collection units have children.
diff --git a/mu/formats/base/writer.py b/mu/formats/base/writer.py
@@ -11,12 +11,13 @@ def write(self, unit: units.Unit) -> "BaseWriter":
         on_func(unit)
 
         # Write children recursively: depth-first traversal
-        for child in unit.children:
-            self.write(child)
+        if isinstance(unit, units.Collection):
+            for child in unit.children:
+                self.write(child)
 
         return self
 
-    def on_unit(self, unit: units.Unit) -> None:
+    def on_collection(self, unit: units.Collection) -> None:
         pass
 
     def on_course(self, unit: units.Course) -> None:
diff --git a/mu/formats/html/reader.py b/mu/formats/html/reader.py
@@ -22,34 +22,87 @@ def __init__(self, unit_html: BeautifulSoup) -> None:
 
     def parse(self) -> t.Iterable[units.Unit]:
         """
-        In this method we only detect the headers. Parsing the actual content of each
-        unit is done in the `on_header` method.
+        Parse the html content.
 
-        This method is called recursively.
+        The dispatch method is called recursively by the child `on_header` method.
         """
-        header_level = None
-        if getattr(self.unit_html, "name"):
-            header_level = get_header_level(self.unit_html.name)
-
-        # Parse html
-        for unit in self.dispatch(self.unit_html.name, self.unit_html):
-            # Find the next header from which we start parsing again
-            for next_html in self.unit_html.find_next_siblings():
-                if next_header_level := get_header_level(next_html.name):
-                    if header_level is None:
-                        # Current unit did not have a header
-                        break
-                    if next_header_level == header_level + 1:
-                        # Next level, create a child reader, parse
-                        child_reader = HtmlReader(next_html)
-                        for child in child_reader.parse():
+        yield from self.iter_units(self.unit_html)
+
+    def iter_units(self, unit_html: BeautifulSoup) -> t.Iterable[units.Unit]:
+        yield from super().dispatch(unit_html.name, unit_html)
+
+    def on_header(self, unit_html: BeautifulSoup) -> t.Iterable[units.Unit]:
+        """
+        Parse `<h1>, ...<h6>` DOM elements.
+
+        This method yields a single Collection for the current header. Headers from
+        level n+1 will be added as children, provided they are direct children.
+
+        This method is a little difficult to read. The problem with html headers is
+        that they break the concept of parent -> child inclusion. So children in the
+        sense of a course are actually siblings in the html world, and we need to figure
+        out which ones are direct children of the current unit.
+        """
+        header_level = get_header_level(unit_html.name)
+        assert header_level is not None
+
+        # Create collection unit
+        UnitClass = units.Course if header_level == 1 else units.Collection
+        unit: units.Collection = UnitClass(
+            attributes=get_data_attributes(unit_html),
+            title=unit_html.string.strip(),
+        )
+
+        # Find children units.
+        siblings_are_children = True
+        for child_html in unit_html.find_next_siblings():
+            if not getattr(child_html, "name"):
+                # Ignore raw strings
+                continue
+            if child_header_level := get_header_level(child_html.name):
+                # Header found: all other siblings are actually children of another unit
+                if child_header_level < header_level:
+                    # Child is actually a parent header: stop searching for children
+                    # Parent header will be parsed in the parent call.
+                    break
+                elif child_header_level == header_level:
+                    # Child is a header with the same level:
+                    # Stop parsing and yield from a different parser.
+                    break
+                elif child_header_level == header_level + 1:
+                    # Direct child -> will be appended to children
+                    for child in self.iter_units(child_html):
+                        unit.add_child(child)
+                    # Other siblings are no longer children of this unit
+                    siblings_are_children = False
+                else:
+                    # Child is a grand-child, so we ignore it
+                    continue
+            else:
+                if siblings_are_children:
+                    # Found a non-header unit: append to children
+                    # (and concatenate RawHtml units in the process)
+                    for child in self.iter_units(child_html):
+                        if (
+                            unit.children
+                            and isinstance(unit.children[-1], units.RawHtml)
+                            and isinstance(child, units.RawHtml)
+                        ):
+                            # Concatenate all RawHtml children
+                            unit.children[-1].concatenate(child)
+                        else:
+                            # Append child
                             unit.add_child(child)
-                    elif next_header_level <= header_level:
-                        # We found a header with the same level or a parent unit. All
-                        # subsequent items belong to it. We stop parsing.
-                        break
-            # Unit is yielded after we have added its children
-            yield unit
+
+        # Yield current unit
+        yield unit
+
+    on_h1 = on_header
+    on_h2 = on_header
+    on_h3 = on_header
+    on_h4 = on_header
+    on_h5 = on_header
+    on_h6 = on_header
 
     def on_section(self, unit_html: BeautifulSoup) -> t.Iterable[units.Unit]:
         """
@@ -70,61 +123,13 @@ def on_section(self, unit_html: BeautifulSoup) -> t.Iterable[units.Unit]:
         else:
             logger.warning("Unit type is unsupported by HTML reader: %s", unit_type)
 
-    def on_header(self, unit_html: BeautifulSoup) -> t.Iterable[units.Unit]:
-        """
-        Parse `<h1>, ...<h6>` DOM elements.
-        """
-        # Create unit
-        UnitClass = units.Course if unit_html.name == "h1" else units.Unit
-        attributes = {
-            k[5:]: v for k, v in unit_html.attrs.items() if k.startswith("data-")
-        }
-        unit: units.Unit = UnitClass(attributes, title=self.unit_html.string.strip())
-
-        # Find children
-        children = []
-        for child_html in unit_html.find_next_siblings():
-            if not getattr(child_html, "name"):
-                # Skip raw string
-                continue
-            elif get_header_level(child_html.name) is not None:
-                # Child is a header: stop processing
-                break
-            for child in self.dispatch(child_html.name, child_html):
-                children.append(child)
-
-        for child in children:
-            if (
-                isinstance(child, units.RawHtml)
-                and unit.children
-                and isinstance(unit.children[-1], units.RawHtml)
-            ):
-                # Concatenate all RawHtml children
-                unit.children[-1].concatenate(child)
-            else:
-                # Append child
-                unit.add_child(child)
-
-        yield unit
-
-    on_h1 = on_header
-    on_h2 = on_header
-    on_h3 = on_header
-    on_h4 = on_header
-    on_h5 = on_header
-    on_h6 = on_header
-
     def _on_html(self, unit_html: BeautifulSoup) -> t.Iterable[units.Unit]:
         """
         All data-* attributes are copied to the RawHtml unit.
         """
         yield units.RawHtml(
             contents=str(unit_html),
-            attributes={
-                key: value
-                for key, value in unit_html.attrs.items()
-                if key.startswith("data-")
-            },
+            attributes=get_data_attributes(unit_html),
         )
 
     # Add here all html elements that should be converted to RawHtml
@@ -179,6 +184,17 @@ def get_header_level(h: str) -> t.Optional[int]:
     return int(match.group(1))
 
 
+def get_data_attributes(unit_html: BeautifulSoup) -> t.Dict[str, str]:
+    """
+    Return all attributes that start with "data-", keyed without that prefix.
+    """
+    return {
+        key[5:]: value
+        for key, value in unit_html.attrs.items()
+        if key.startswith("data-")
+    }
+
+
 def process_mcq(unit_html: BeautifulSoup) -> t.Iterable[units.Unit]:
     """
     <ul> tags may contain multiple choice questions. In such cases, the first <li>
diff --git a/mu/formats/html/writer.py b/mu/formats/html/writer.py
@@ -38,7 +38,7 @@ def get_header(self, unit: units.Unit) -> Tag:
             tag.string = title
         return tag
 
-    def on_unit(self, unit: units.Unit) -> None:
+    def on_collection(self, unit: units.Collection) -> None:
         self.append_to_body(self.get_header(unit))
 
     def on_course(self, unit: units.Course) -> None:
diff --git a/mu/formats/olx/reader.py b/mu/formats/olx/reader.py
@@ -28,23 +28,38 @@ def parse(self) -> t.Iterable[units.Unit]:
             return
 
         # Dispatch call to on_* functions
-        for unit in self.dispatch(self.unit_xml.name, self.unit_xml):
-            # Parse children
-            for child_xml in self.unit_xml.children:
-                reader = self.get_child_reader(child_xml)
-                for child in reader.parse():
-                    unit.add_child(child)
-            yield unit
+        yield from self.dispatch(self.unit_xml.name, self.unit_xml)
+
+    def parse_children(self) -> t.Iterable[units.Unit]:
+        # Parse children
+        for child_xml in self.unit_xml.children:
+            reader = self.get_child_reader(child_xml)
+            yield from reader.parse()
+
+    def _on_collection(
+        self, unit_xml: BeautifulSoup, collection: t.Optional[units.Collection] = None
+    ) -> t.Iterable[units.Unit]:
+        """
+        Dispatch function for course, chapter, sequential and vertical units.
+        """
+        if collection is None:
+            collection = units.Collection(
+                get_unit_attributes(unit_xml),
+                title=unit_xml.attrs.get("display_name", ""),
+            )
+        for child in self.parse_children():
+            collection.add_child(child)
+        yield collection
 
     def on_course(self, unit_xml: BeautifulSoup) -> t.Iterable[units.Unit]:
-        yield units.Course(
+        course = units.Course(
             get_unit_attributes(unit_xml), title=unit_xml.attrs.get("display_name", "")
         )
+        yield from self._on_collection(unit_xml, course)
 
-    def on_chapter(self, unit_xml: BeautifulSoup) -> t.Iterable[units.Unit]:
-        yield units.Unit(
-            get_unit_attributes(unit_xml), title=unit_xml.attrs.get("display_name", "")
-        )
+    on_chapter = _on_collection
+    on_sequential = _on_collection
+    on_vertical = _on_collection
 
     def on_problem(self, unit_xml: BeautifulSoup) -> t.Iterable[units.Unit]:
         """
@@ -80,16 +95,6 @@ def on_problem(self, unit_xml: BeautifulSoup) -> t.Iterable[units.Unit]:
                 answers=ftq_answers,
             )
 
-    def on_sequential(self, unit_xml: BeautifulSoup) -> t.Iterable[units.Unit]:
-        yield units.Unit(
-            get_unit_attributes(unit_xml), title=unit_xml.attrs.get("display_name", "")
-        )
-
-    def on_vertical(self, unit_xml: BeautifulSoup) -> t.Iterable[units.Unit]:
-        yield units.Unit(
-            get_unit_attributes(unit_xml), title=unit_xml.attrs.get("display_name", "")
-        )
-
     def on_html(self, unit_xml: BeautifulSoup) -> t.Iterable[units.Unit]:
         """
         https://edx.readthedocs.io/projects/edx-open-learning-xml/en/latest/components/html-components.html
diff --git a/mu/formats/olx/writer.py b/mu/formats/olx/writer.py
@@ -25,7 +25,7 @@ def write_to(self, path: str) -> None:
             # Write all xml files
             write_xml(unit_xml, os.path.join(path, unit_path), makedirs=True)
 
-    def on_unit(self, unit: units.Unit) -> None:
+    def on_collection(self, unit: units.Collection) -> None:
         self.process_top_level_unit(unit)
 
     def on_course(self, unit: units.Course) -> None:
diff --git a/mu/units.py b/mu/units.py
@@ -1,37 +1,56 @@
 import typing as t
 
+U = t.TypeVar("U", bound="Unit")
+
 
 class Unit:
     """
     A generic course unit.
 
-    A course unit is a tree structure, where each element can have arbitrary attributes.
-    All units also have a title and key/value attributes.
+    All units have an optional title and key/value attributes.
+
+    Courses follow a tree structure. Units which are not collections are terminal
+    leaves. Every unit (except the top-level one) has a parent which is an instance
+    of a Collection.
     """
 
     def __init__(
         self, attributes: t.Optional[t.Dict[str, str]] = None, title: str = ""
     ):
         self.attributes = attributes or {}
-        self.children: t.List[Unit] = []
-        self.parent: t.Optional[Unit] = None
+        self.parent: t.Optional["Collection"] = None
         self.title = title
 
-    def add_child(self, unit: "Unit") -> "Unit":
-        unit.parent = self
-        self.children.append(unit)
-        return unit
-
     @property
     def depth(self) -> int:
         if self.parent is None:
             return 0
         return self.parent.depth + 1
 
 
-class Course(Unit):
+class Collection(Unit):
+    """
+    A special type of Unit which can include children units.
+    """
+
+    def __init__(
+        self, attributes: t.Optional[t.Dict[str, str]] = None, title: str = ""
+    ):
+        super().__init__(attributes=attributes, title=title)
+        self.children: t.List[Unit] = []
+
+    def add_child(self, unit: U) -> U:
+        unit.parent = self
+        self.children.append(unit)
+        return unit
+
+
+class Course(Collection):
     """
     Top-level element of a course.
+
+    For now there is nothing special about this unit, but we may add extra properties in
+    the future.
     """
 
 
diff --git a/tests/test_html.py b/tests/test_html.py
diff --git a/tests/test_olx.py b/tests/test_olx.py