Skip to content

Commit a9005d2

Browse files
committed
refactor: collection units
Not every unit should have children. It's much simpler and cleaner if only collection units have children.
1 parent a350133 commit a9005d2

File tree

8 files changed

+173
-123
lines changed

8 files changed

+173
-123
lines changed

mu/formats/base/writer.py

+4-3
Original file line numberDiff line numberDiff line change
@@ -11,12 +11,13 @@ def write(self, unit: units.Unit) -> "BaseWriter":
1111
on_func(unit)
1212

1313
# Write children recursively: depth-first traversal
14-
for child in unit.children:
15-
self.write(child)
14+
if isinstance(unit, units.Collection):
15+
for child in unit.children:
16+
self.write(child)
1617

1718
return self
1819

19-
def on_unit(self, unit: units.Unit) -> None:
20+
def on_collection(self, unit: units.Collection) -> None:
2021
pass
2122

2223
def on_course(self, unit: units.Course) -> None:

mu/formats/html/reader.py

+90-74
Original file line numberDiff line numberDiff line change
@@ -22,34 +22,87 @@ def __init__(self, unit_html: BeautifulSoup) -> None:
2222

2323
def parse(self) -> t.Iterable[units.Unit]:
2424
"""
25-
In this method we only detect the headers. Parsing the actual content of each
26-
unit is done in the `on_header` method.
25+
Parse the html content.
2726
28-
This method is called recursively.
27+
The dispatch method is called recursively by the child `on_header` method.
2928
"""
30-
header_level = None
31-
if getattr(self.unit_html, "name"):
32-
header_level = get_header_level(self.unit_html.name)
33-
34-
# Parse html
35-
for unit in self.dispatch(self.unit_html.name, self.unit_html):
36-
# Find the next header from which we start parsing again
37-
for next_html in self.unit_html.find_next_siblings():
38-
if next_header_level := get_header_level(next_html.name):
39-
if header_level is None:
40-
# Current unit did not have a header
41-
break
42-
if next_header_level == header_level + 1:
43-
# Next level, create a child reader, parse
44-
child_reader = HtmlReader(next_html)
45-
for child in child_reader.parse():
29+
yield from self.iter_units(self.unit_html)
30+
31+
def iter_units(self, unit_html: BeautifulSoup) -> t.Iterable[units.Unit]:
32+
yield from super().dispatch(unit_html.name, unit_html)
33+
34+
def on_header(self, unit_html: BeautifulSoup) -> t.Iterable[units.Unit]:
35+
"""
36+
Parse `<h1>, ...<h6>` DOM elements.
37+
38+
This method yields a single Collection for the current header. Headers from
39+
level n+1 will be added as children, provided they are direct children.
40+
41+
This method is a little difficult to read. The problem with html headers is
42+
that they break the concept of parent -> child inclusion. So children in the
43+
sense of a course are actually siblings in the html world, and we need to figure
44+
out which ones are direct children of the current unit.
45+
"""
46+
header_level = get_header_level(unit_html.name)
47+
assert header_level is not None
48+
49+
# Create collection unit
50+
UnitClass = units.Course if header_level == 1 else units.Collection
51+
unit: units.Collection = UnitClass(
52+
attributes=get_data_attributes(unit_html),
53+
title=unit_html.string.strip(),
54+
)
55+
56+
# Find children units.
57+
siblings_are_children = True
58+
for child_html in unit_html.find_next_siblings():
59+
if not getattr(child_html, "name"):
60+
# Ignore raw strings
61+
continue
62+
if child_header_level := get_header_level(child_html.name):
63+
# Header found: all other siblings are actually children of another unit
64+
if child_header_level < header_level:
65+
# Child is actually a parent header: stop searching for children
66+
# Parent header will be parsed in the parent call.
67+
break
68+
elif child_header_level == header_level:
69+
# Child is a header with the same level:
70+
# Stop parsing and yield from a different parser.
71+
break
72+
elif child_header_level == header_level + 1:
73+
# Direct child -> will be appended to children
74+
for child in self.iter_units(child_html):
75+
unit.add_child(child)
76+
# Other siblings are no longer children of this unit
77+
siblings_are_children = False
78+
else:
79+
# Child is a grand-child, so we ignore it
80+
continue
81+
else:
82+
if siblings_are_children:
83+
# Found a non-header unit: append to children
84+
# (and concatenate RawHtml units in the process)
85+
for child in self.iter_units(child_html):
86+
if (
87+
unit.children
88+
and isinstance(unit.children[-1], units.RawHtml)
89+
and isinstance(child, units.RawHtml)
90+
):
91+
# Concatenate all RawHtml children
92+
unit.children[-1].concatenate(child)
93+
else:
94+
# Append child
4695
unit.add_child(child)
47-
elif next_header_level <= header_level:
48-
# We found a header with the same level or a parent unit. All
49-
# subsequent items belong to it. We stop parsing.
50-
break
51-
# Unit is yielded after we have added its children
52-
yield unit
96+
97+
# Yield current unit
98+
yield unit
99+
100+
on_h1 = on_header
101+
on_h2 = on_header
102+
on_h3 = on_header
103+
on_h4 = on_header
104+
on_h5 = on_header
105+
on_h6 = on_header
53106

54107
def on_section(self, unit_html: BeautifulSoup) -> t.Iterable[units.Unit]:
55108
"""
@@ -70,61 +123,13 @@ def on_section(self, unit_html: BeautifulSoup) -> t.Iterable[units.Unit]:
70123
else:
71124
logger.warning("Unit type is unsupported by HTML reader: %s", unit_type)
72125

73-
def on_header(self, unit_html: BeautifulSoup) -> t.Iterable[units.Unit]:
74-
"""
75-
Parse `<h1>, ...<h6>` DOM elements.
76-
"""
77-
# Create unit
78-
UnitClass = units.Course if unit_html.name == "h1" else units.Unit
79-
attributes = {
80-
k[5:]: v for k, v in unit_html.attrs.items() if k.startswith("data-")
81-
}
82-
unit: units.Unit = UnitClass(attributes, title=self.unit_html.string.strip())
83-
84-
# Find children
85-
children = []
86-
for child_html in unit_html.find_next_siblings():
87-
if not getattr(child_html, "name"):
88-
# Skip raw string
89-
continue
90-
elif get_header_level(child_html.name) is not None:
91-
# Child is a header: stop processing
92-
break
93-
for child in self.dispatch(child_html.name, child_html):
94-
children.append(child)
95-
96-
for child in children:
97-
if (
98-
isinstance(child, units.RawHtml)
99-
and unit.children
100-
and isinstance(unit.children[-1], units.RawHtml)
101-
):
102-
# Concatenate all RawHtml children
103-
unit.children[-1].concatenate(child)
104-
else:
105-
# Append child
106-
unit.add_child(child)
107-
108-
yield unit
109-
110-
on_h1 = on_header
111-
on_h2 = on_header
112-
on_h3 = on_header
113-
on_h4 = on_header
114-
on_h5 = on_header
115-
on_h6 = on_header
116-
117126
def _on_html(self, unit_html: BeautifulSoup) -> t.Iterable[units.Unit]:
118127
"""
119128
All data-* attributes are copied to the RawHtml unit.
120129
"""
121130
yield units.RawHtml(
122131
contents=str(unit_html),
123-
attributes={
124-
key: value
125-
for key, value in unit_html.attrs.items()
126-
if key.startswith("data-")
127-
},
132+
attributes=get_data_attributes(unit_html),
128133
)
129134

130135
# Add here all html elements that should be converted to RawHtml
@@ -179,6 +184,17 @@ def get_header_level(h: str) -> t.Optional[int]:
179184
return int(match.group(1))
180185

181186

187+
def get_data_attributes(unit_html: BeautifulSoup) -> t.Dict[str, str]:
188+
"""
189+
Return all attributes that start with "data-"
190+
"""
191+
return {
192+
key[5:]: value
193+
for key, value in unit_html.attrs.items()
194+
if key.startswith("data-")
195+
}
196+
197+
182198
def process_mcq(unit_html: BeautifulSoup) -> t.Iterable[units.Unit]:
183199
"""
184200
<ul> tags may contain multiple choice questions. In such cases, the first <li>

mu/formats/html/writer.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ def get_header(self, unit: units.Unit) -> Tag:
3838
tag.string = title
3939
return tag
4040

41-
def on_unit(self, unit: units.Unit) -> None:
41+
def on_collection(self, unit: units.Collection) -> None:
4242
self.append_to_body(self.get_header(unit))
4343

4444
def on_course(self, unit: units.Course) -> None:

mu/formats/olx/reader.py

+27-22
Original file line numberDiff line numberDiff line change
@@ -28,23 +28,38 @@ def parse(self) -> t.Iterable[units.Unit]:
2828
return
2929

3030
# Dispatch call to on_* functions
31-
for unit in self.dispatch(self.unit_xml.name, self.unit_xml):
32-
# Parse children
33-
for child_xml in self.unit_xml.children:
34-
reader = self.get_child_reader(child_xml)
35-
for child in reader.parse():
36-
unit.add_child(child)
37-
yield unit
31+
yield from self.dispatch(self.unit_xml.name, self.unit_xml)
32+
33+
def parse_children(self) -> t.Iterable[units.Unit]:
34+
# Parse children
35+
for child_xml in self.unit_xml.children:
36+
reader = self.get_child_reader(child_xml)
37+
yield from reader.parse()
38+
39+
def _on_collection(
40+
self, unit_xml: BeautifulSoup, collection: t.Optional[units.Collection] = None
41+
) -> t.Iterable[units.Unit]:
42+
"""
43+
Dispatch function for course, chapter, sequential and vertical units.
44+
"""
45+
if collection is None:
46+
collection = units.Collection(
47+
get_unit_attributes(unit_xml),
48+
title=unit_xml.attrs.get("display_name", ""),
49+
)
50+
for child in self.parse_children():
51+
collection.add_child(child)
52+
yield collection
3853

3954
def on_course(self, unit_xml: BeautifulSoup) -> t.Iterable[units.Unit]:
40-
yield units.Course(
55+
course = units.Course(
4156
get_unit_attributes(unit_xml), title=unit_xml.attrs.get("display_name", "")
4257
)
58+
yield from self._on_collection(unit_xml, course)
4359

44-
def on_chapter(self, unit_xml: BeautifulSoup) -> t.Iterable[units.Unit]:
45-
yield units.Unit(
46-
get_unit_attributes(unit_xml), title=unit_xml.attrs.get("display_name", "")
47-
)
60+
on_chapter = _on_collection
61+
on_sequential = _on_collection
62+
on_vertical = _on_collection
4863

4964
def on_problem(self, unit_xml: BeautifulSoup) -> t.Iterable[units.Unit]:
5065
"""
@@ -80,16 +95,6 @@ def on_problem(self, unit_xml: BeautifulSoup) -> t.Iterable[units.Unit]:
8095
answers=ftq_answers,
8196
)
8297

83-
def on_sequential(self, unit_xml: BeautifulSoup) -> t.Iterable[units.Unit]:
84-
yield units.Unit(
85-
get_unit_attributes(unit_xml), title=unit_xml.attrs.get("display_name", "")
86-
)
87-
88-
def on_vertical(self, unit_xml: BeautifulSoup) -> t.Iterable[units.Unit]:
89-
yield units.Unit(
90-
get_unit_attributes(unit_xml), title=unit_xml.attrs.get("display_name", "")
91-
)
92-
9398
def on_html(self, unit_xml: BeautifulSoup) -> t.Iterable[units.Unit]:
9499
"""
95100
https://edx.readthedocs.io/projects/edx-open-learning-xml/en/latest/components/html-components.html

mu/formats/olx/writer.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ def write_to(self, path: str) -> None:
2525
# Write all xml files
2626
write_xml(unit_xml, os.path.join(path, unit_path), makedirs=True)
2727

28-
def on_unit(self, unit: units.Unit) -> None:
28+
def on_collection(self, unit: units.Collection) -> None:
2929
self.process_top_level_unit(unit)
3030

3131
def on_course(self, unit: units.Course) -> None:

mu/units.py

+29-10
Original file line numberDiff line numberDiff line change
@@ -1,37 +1,56 @@
11
import typing as t
22

3+
U = t.TypeVar("U", bound="Unit")
4+
35

46
class Unit:
57
"""
68
A generic course unit.
79
8-
A course unit is a tree structure, where each element can have arbitrary attributes.
9-
All units also have a title and key/value attributes.
10+
All units have an optional title and key/value attributes.
11+
12+
Courses follow a tree structure. Units which are not containers are terminal leaves.
13+
Every unit (except the top-level one) has a parent which is an instance of a
14+
Collection.
1015
"""
1116

1217
def __init__(
1318
self, attributes: t.Optional[t.Dict[str, str]] = None, title: str = ""
1419
):
1520
self.attributes = attributes or {}
16-
self.children: t.List[Unit] = []
17-
self.parent: t.Optional[Unit] = None
21+
self.parent: t.Optional["Collection"] = None
1822
self.title = title
1923

20-
def add_child(self, unit: "Unit") -> "Unit":
21-
unit.parent = self
22-
self.children.append(unit)
23-
return unit
24-
2524
@property
2625
def depth(self) -> int:
2726
if self.parent is None:
2827
return 0
2928
return self.parent.depth + 1
3029

3130

32-
class Course(Unit):
31+
class Collection(Unit):
32+
"""
33+
A special type of Unit which can include children units.
34+
"""
35+
36+
def __init__(
37+
self, attributes: t.Optional[t.Dict[str, str]] = None, title: str = ""
38+
):
39+
super().__init__(attributes=attributes, title=title)
40+
self.children: t.List[Unit] = []
41+
42+
def add_child(self, unit: U) -> U:
43+
unit.parent = self
44+
self.children.append(unit)
45+
return unit
46+
47+
48+
class Course(Collection):
3349
"""
3450
Top-level element of a course.
51+
52+
For now there is nothing special about this unit, but we may add extra properties in
53+
the future.
3554
"""
3655

3756

0 commit comments

Comments
 (0)