@@ -22,34 +22,87 @@ def __init__(self, unit_html: BeautifulSoup) -> None:
22
22
23
23
def parse(self) -> t.Iterable[units.Unit]:
    """
    Parse the html content and yield the resulting units.

    Parsing starts from the element stored on `self.unit_html` and is entirely
    delegated to `iter_units`. Recursion into nested units does not happen here:
    it is triggered by `on_header`, which calls `iter_units` again for every
    child element it finds.
    """
    root_html = self.unit_html
    yield from self.iter_units(root_html)
30
+
31
def iter_units(self, unit_html: BeautifulSoup) -> t.Iterable[units.Unit]:
    """
    Yield the units parsed from a single html element.

    The element is handed over, together with its tag name, to the `dispatch`
    method of the parent class.
    """
    dispatch = super().dispatch
    yield from dispatch(unit_html.name, unit_html)
33
+
34
def on_header(self, unit_html: BeautifulSoup) -> t.Iterable[units.Unit]:
    """
    Parse `<h1>, ...<h6>` DOM elements.

    This method yields a single Collection (a Course for `<h1>`) for the current
    header. Headers from level n+1 are added as children, provided they are direct
    children.

    The problem with html headers is that they break the concept of
    parent -> child inclusion: children in the sense of a course are actually
    siblings in the html world, so we need to figure out which of the following
    siblings belong to the current unit:

    - non-header siblings are children of this unit, until the first deeper
      header is met;
    - a header of level n+1 is a direct child: it is parsed recursively (and
      collects its own children in the process);
    - a header deeper than n+1 is not a direct child and is not parsed here;
    - a header of level n or less ends the search: it is handled by another
      call.
    """
    header_level = get_header_level(unit_html.name)
    assert header_level is not None, "on_header expects an <h1>...<h6> element"

    # Create collection unit.
    # `get_text()` is used instead of `.string` because the latter is None as
    # soon as the header has more than one child node (e.g. `<h2>a <em>b</em></h2>`)
    # or is empty, which would crash on `.strip()`.
    UnitClass = units.Course if header_level == 1 else units.Collection
    unit: units.Collection = UnitClass(
        attributes=get_data_attributes(unit_html),
        title=unit_html.get_text().strip(),
    )

    # Find children units.
    siblings_are_children = True
    for child_html in unit_html.find_next_siblings():
        if not getattr(child_html, "name", None):
            # Ignore raw strings
            continue

        if child_header_level := get_header_level(child_html.name):
            if child_header_level <= header_level:
                # Same-level or parent header: stop searching for children.
                # That header will be parsed by another call.
                break

            # Deeper header found: all following siblings are children of
            # another unit. This must also hold when a level is skipped (e.g.
            # an <h3> right after an <h1>), otherwise the content located
            # after the skipped header would be attached to the current unit.
            siblings_are_children = False
            if child_header_level == header_level + 1:
                # Direct child -> will be appended to children
                for child in self.iter_units(child_html):
                    unit.add_child(child)
            # Deeper headers are not direct children: they are ignored here.
            continue

        if not siblings_are_children:
            # Non-header element that belongs to a child header
            continue

        # Found a non-header unit: append to children
        # (and concatenate RawHtml units in the process)
        for child in self.iter_units(child_html):
            if (
                unit.children
                and isinstance(unit.children[-1], units.RawHtml)
                and isinstance(child, units.RawHtml)
            ):
                # Concatenate all RawHtml children
                unit.children[-1].concatenate(child)
            else:
                # Append child
                unit.add_child(child)

    # Yield current unit, once all its children have been added
    yield unit

# All header tags share the same handler.
# NOTE(review): presumably `dispatch` looks handlers up by tag name -- confirm.
on_h1 = on_header
on_h2 = on_header
on_h3 = on_header
on_h4 = on_header
on_h5 = on_header
on_h6 = on_header
53
106
54
107
def on_section (self , unit_html : BeautifulSoup ) -> t .Iterable [units .Unit ]:
55
108
"""
@@ -70,61 +123,13 @@ def on_section(self, unit_html: BeautifulSoup) -> t.Iterable[units.Unit]:
70
123
else :
71
124
logger .warning ("Unit type is unsupported by HTML reader: %s" , unit_type )
72
125
73
- def on_header (self , unit_html : BeautifulSoup ) -> t .Iterable [units .Unit ]:
74
- """
75
- Parse `<h1>, ...<h6>` DOM elements.
76
- """
77
- # Create unit
78
- UnitClass = units .Course if unit_html .name == "h1" else units .Unit
79
- attributes = {
80
- k [5 :]: v for k , v in unit_html .attrs .items () if k .startswith ("data-" )
81
- }
82
- unit : units .Unit = UnitClass (attributes , title = self .unit_html .string .strip ())
83
-
84
- # Find children
85
- children = []
86
- for child_html in unit_html .find_next_siblings ():
87
- if not getattr (child_html , "name" ):
88
- # Skip raw string
89
- continue
90
- elif get_header_level (child_html .name ) is not None :
91
- # Child is a header: stop processing
92
- break
93
- for child in self .dispatch (child_html .name , child_html ):
94
- children .append (child )
95
-
96
- for child in children :
97
- if (
98
- isinstance (child , units .RawHtml )
99
- and unit .children
100
- and isinstance (unit .children [- 1 ], units .RawHtml )
101
- ):
102
- # Concatenate all RawHtml children
103
- unit .children [- 1 ].concatenate (child )
104
- else :
105
- # Append child
106
- unit .add_child (child )
107
-
108
- yield unit
109
-
110
- on_h1 = on_header
111
- on_h2 = on_header
112
- on_h3 = on_header
113
- on_h4 = on_header
114
- on_h5 = on_header
115
- on_h6 = on_header
116
-
117
126
def _on_html(self, unit_html: BeautifulSoup) -> t.Iterable[units.Unit]:
    """
    Wrap an html element in a single RawHtml unit.

    The contents of the unit are the element serialized with `str`. Its
    attributes are the data-* attributes of the element, as returned by
    `get_data_attributes` (i.e. with the "data-" prefix removed from the keys).
    """
    html_contents = str(unit_html)
    data_attributes = get_data_attributes(unit_html)
    yield units.RawHtml(contents=html_contents, attributes=data_attributes)
129
134
130
135
# Add here all html elements that should be converted to RawHtml
@@ -179,6 +184,17 @@ def get_header_level(h: str) -> t.Optional[int]:
179
184
return int (match .group (1 ))
180
185
181
186
187
def get_data_attributes(unit_html: BeautifulSoup) -> t.Dict[str, str]:
    """
    Return the attributes of an html element whose name starts with "data-".

    The "data-" prefix is removed from the returned keys: an element with
    `data-foo="bar"` yields `{"foo": "bar"}`. All other attributes are ignored.
    """
    prefix = "data-"
    prefix_length = len(prefix)
    return {
        name[prefix_length:]: value
        for name, value in unit_html.attrs.items()
        if name.startswith(prefix)
    }
196
+
197
+
182
198
def process_mcq (unit_html : BeautifulSoup ) -> t .Iterable [units .Unit ]:
183
199
"""
184
200
<ul> tags may contain multiple choice questions. In such cases, the first <li>
0 commit comments