|
geoffers@34
|
1 |
# coding=UTF-8
|
|
geoffers@34
|
2 |
# Copyright (c) 2008 Geoffrey Sneddon
|
|
geoffers@262
|
3 |
#
|
|
geoffers@34
|
4 |
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
geoffers@34
|
5 |
# of this software and associated documentation files (the "Software"), to deal
|
|
geoffers@34
|
6 |
# in the Software without restriction, including without limitation the rights
|
|
geoffers@34
|
7 |
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
geoffers@34
|
8 |
# copies of the Software, and to permit persons to whom the Software is
|
|
geoffers@34
|
9 |
# furnished to do so, subject to the following conditions:
|
|
geoffers@262
|
10 |
#
|
|
geoffers@34
|
11 |
# The above copyright notice and this permission notice shall be included in
|
|
geoffers@34
|
12 |
# all copies or substantial portions of the Software.
|
|
geoffers@262
|
13 |
#
|
|
geoffers@34
|
14 |
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
geoffers@34
|
15 |
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
geoffers@34
|
16 |
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
geoffers@34
|
17 |
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
geoffers@34
|
18 |
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
geoffers@34
|
19 |
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
geoffers@34
|
20 |
# THE SOFTWARE.
|
|
geoffers@34
|
21 |
|
|
geoffers@34
|
22 |
from lxml import etree
|
|
geoffers@34
|
23 |
|
|
geoffers@252
|
24 |
from anolislib import utils
|
|
geoffers@34
|
25 |
|
|
geoffers@40
|
26 |
# Rank of heading elements (these are negative so h1 > h6)
|
|
geoffers@262
|
27 |
rank = {u"h1": -1, u"h2": -2, u"h3": -3, u"h4": -4, u"h5": -5, u"h6": -6,
|
|
geoffers@262
|
28 |
u"header": -1}
|
|
geoffers@34
|
29 |
|
|
geoffers@263
|
30 |
|
|
geoffers@39
|
31 |
class section(list):
|
|
geoffers@262
|
32 |
"""Represents the section of a document."""
|
|
geoffers@39
|
33 |
|
|
geoffers@262
|
34 |
header = None
|
|
geoffers@262
|
35 |
|
|
geoffers@262
|
36 |
def __repr__(self):
|
|
geoffers@262
|
37 |
return "<section %s>" % (repr(self.header))
|
|
geoffers@262
|
38 |
|
|
geoffers@262
|
39 |
def append(self, child):
|
|
geoffers@262
|
40 |
list.append(self, child)
|
|
geoffers@262
|
41 |
child.parent = self
|
|
geoffers@262
|
42 |
|
|
geoffers@262
|
43 |
def extend(self, children):
|
|
geoffers@262
|
44 |
list.extend(self, children)
|
|
geoffers@262
|
45 |
for child in children:
|
|
geoffers@262
|
46 |
child.parent = self
|
|
geoffers@34
|
47 |
|
|
geoffers@263
|
48 |
|
|
geoffers@41
|
49 |
class Outliner:
|
|
geoffers@262
|
50 |
"""Build the outline of an HTML document."""
|
|
geoffers@262
|
51 |
|
|
geoffers@262
|
52 |
def __init__(self, ElementTree, **kwargs):
|
|
geoffers@262
|
53 |
self.ElementTree = ElementTree
|
|
geoffers@262
|
54 |
self.stack = []
|
|
geoffers@262
|
55 |
self.outlines = {}
|
|
geoffers@262
|
56 |
self.current_outlinee = None
|
|
geoffers@262
|
57 |
self.current_section = None
|
|
geoffers@262
|
58 |
|
|
geoffers@262
|
59 |
def build(self, **kwargs):
|
|
geoffers@262
|
60 |
for action, element in etree.iterwalk(self.ElementTree,
|
|
geoffers@262
|
61 |
events=("start", "end")):
|
|
geoffers@262
|
62 |
# If the top of the stack is an element, and you are exiting that
|
|
geoffers@262
|
63 |
# element
|
|
geoffers@262
|
64 |
if action == "end" and self.stack and self.stack[-1] == element:
|
|
geoffers@262
|
65 |
# Note: The element being exited is a heading content element.
|
|
geoffers@262
|
66 |
assert element.tag in utils.heading_content
|
|
geoffers@262
|
67 |
# Pop that element from the stack.
|
|
geoffers@262
|
68 |
self.stack.pop()
|
|
geoffers@262
|
69 |
|
|
geoffers@262
|
70 |
# If the top of the stack is a heading content element
|
|
geoffers@262
|
71 |
elif self.stack and self.stack[-1].tag in utils.heading_content:
|
|
geoffers@262
|
72 |
# Do nothing.
|
|
geoffers@262
|
73 |
pass
|
|
geoffers@262
|
74 |
|
|
geoffers@262
|
75 |
# When entering a sectioning content element or a sectioning root
|
|
geoffers@262
|
76 |
# element
|
|
geoffers@262
|
77 |
elif action == "start" and \
|
|
geoffers@262
|
78 |
(element.tag in utils.sectioning_content or \
|
|
geoffers@262
|
79 |
element.tag in utils.sectioning_root):
|
|
geoffers@262
|
80 |
# If current outlinee is not null, push current outlinee onto
|
|
geoffers@262
|
81 |
# the stack.
|
|
geoffers@262
|
82 |
if self.current_outlinee is not None:
|
|
geoffers@262
|
83 |
self.stack.append(self.current_outlinee)
|
|
geoffers@262
|
84 |
# Let current outlinee be the element that is being entered.
|
|
geoffers@262
|
85 |
self.current_outlinee = element
|
|
geoffers@262
|
86 |
# Let current section be a newly created section for the
|
|
geoffers@262
|
87 |
# current outlinee element.
|
|
geoffers@262
|
88 |
self.current_section = section()
|
|
geoffers@262
|
89 |
# Let there be a new outline for the new current outlinee,
|
|
geoffers@262
|
90 |
# initialized with just the new current section as the only
|
|
geoffers@262
|
91 |
# section in the outline.
|
|
geoffers@262
|
92 |
self.outlines[self.current_outlinee] = [self.current_section]
|
|
geoffers@262
|
93 |
|
|
geoffers@262
|
94 |
# When exiting a sectioning content element, if the stack is not
|
|
geoffers@262
|
95 |
# empty
|
|
geoffers@262
|
96 |
elif action == "end" and \
|
|
geoffers@262
|
97 |
element.tag in utils.sectioning_content and self.stack:
|
|
geoffers@262
|
98 |
# Pop the top element from the stack, and let the current
|
|
geoffers@262
|
99 |
# outlinee be that element.
|
|
geoffers@262
|
100 |
self.current_outlinee = self.stack.pop()
|
|
geoffers@262
|
101 |
# Let current section be the last section in the outline of the
|
|
geoffers@262
|
102 |
# current outlinee element.
|
|
geoffers@262
|
103 |
self.current_section = self.outlines[self.current_outlinee][-1]
|
|
geoffers@262
|
104 |
# Append the outline of the sectioning content element being
|
|
geoffers@262
|
105 |
# exited to the current section. (This does not change which
|
|
geoffers@262
|
106 |
# section is the last section in the outline.)
|
|
geoffers@262
|
107 |
self.current_section += self.outlines[element]
|
|
geoffers@262
|
108 |
|
|
geoffers@262
|
109 |
# When exiting a sectioning root element, if the stack is not empty
|
|
geoffers@262
|
110 |
elif action == "end" and element.tag in utils.sectioning_root and \
|
|
geoffers@262
|
111 |
self.stack:
|
|
geoffers@262
|
112 |
# Pop the top element from the stack, and let the current
|
|
geoffers@262
|
113 |
# outlinee be that element.
|
|
geoffers@262
|
114 |
self.current_outlinee = self.stack.pop()
|
|
geoffers@262
|
115 |
# Let current section be the last section in the outline of the
|
|
geoffers@262
|
116 |
# current outlinee element.
|
|
geoffers@262
|
117 |
self.current_section = self.outlines[self.current_outlinee][-1]
|
|
geoffers@262
|
118 |
# Loop: If current section has no child sections, stop these
|
|
geoffers@262
|
119 |
# steps.
|
|
geoffers@262
|
120 |
while self.current_section:
|
|
geoffers@262
|
121 |
# Let current section be the last child section of the
|
|
geoffers@262
|
122 |
# current current section.
|
|
geoffers@262
|
123 |
assert self.current_section != self.current_section[-1]
|
|
geoffers@262
|
124 |
self.current_section = self.current_section[-1]
|
|
geoffers@262
|
125 |
# Go back to the substep labeled Loop.
|
|
geoffers@262
|
126 |
|
|
geoffers@262
|
127 |
# When exiting a sectioning content element or a sectioning root
|
|
geoffers@262
|
128 |
# element
|
|
geoffers@262
|
129 |
elif action == "end" and \
|
|
geoffers@262
|
130 |
(element.tag in utils.sectioning_content or \
|
|
geoffers@262
|
131 |
element.tag in utils.sectioning_root):
|
|
geoffers@262
|
132 |
# Note: The current outlinee is the element being exited.
|
|
geoffers@262
|
133 |
assert self.current_outlinee == element
|
|
geoffers@262
|
134 |
# Let current section be the first section in the outline of
|
|
geoffers@262
|
135 |
# the current outlinee element.
|
|
geoffers@262
|
136 |
self.current_section = self.outlines[self.current_outlinee][0]
|
|
geoffers@262
|
137 |
# Skip to the next step in the overall set of steps. (The walk
|
|
geoffers@262
|
138 |
# is over.)
|
|
geoffers@262
|
139 |
break
|
|
geoffers@262
|
140 |
|
|
geoffers@262
|
141 |
# If the current outlinee is null.
|
|
geoffers@262
|
142 |
elif self.current_outlinee is None:
|
|
geoffers@262
|
143 |
# Do nothing.
|
|
geoffers@262
|
144 |
pass
|
|
geoffers@262
|
145 |
|
|
geoffers@262
|
146 |
# When entering a heading content element
|
|
geoffers@262
|
147 |
elif action == "start" and element.tag in utils.heading_content:
|
|
geoffers@262
|
148 |
# If the current section has no heading, let the element being
|
|
geoffers@262
|
149 |
# entered be the heading for the current section.
|
|
geoffers@262
|
150 |
if self.current_section.header is None:
|
|
geoffers@262
|
151 |
self.current_section.header = element
|
|
geoffers@262
|
152 |
|
|
geoffers@262
|
153 |
# Otherwise, if the element being entered has a rank equal to
|
|
geoffers@262
|
154 |
# or greater than the heading of the last section of the
|
|
geoffers@262
|
155 |
# outline of the current outlinee, then create a new section
|
|
geoffers@262
|
156 |
# and append it to the outline of the current outlinee element,
|
|
geoffers@262
|
157 |
# so that this new section is the new last section of that
|
|
geoffers@262
|
158 |
# outline. Let current section be that new section. Let the
|
|
geoffers@262
|
159 |
# element being entered be the new heading for the current
|
|
geoffers@262
|
160 |
# section.
|
|
geoffers@262
|
161 |
elif rank[element.tag] >= \
|
|
geoffers@262
|
162 |
rank[self.outlines[self.current_outlinee][-1].header.tag]:
|
|
geoffers@262
|
163 |
self.current_section = section()
|
|
geoffers@262
|
164 |
self.outlines[self.current_outlinee] \
|
|
geoffers@262
|
165 |
.append(self.current_section)
|
|
geoffers@262
|
166 |
self.current_section.header = element
|
|
geoffers@262
|
167 |
|
|
geoffers@262
|
168 |
# Otherwise, run these substeps:
|
|
geoffers@262
|
169 |
else:
|
|
geoffers@262
|
170 |
# Let candidate section be current section.
|
|
geoffers@262
|
171 |
candidate_section = self.current_section
|
|
geoffers@262
|
172 |
while True:
|
|
geoffers@262
|
173 |
# If the element being entered has a rank lower than
|
|
geoffers@262
|
174 |
# the rank of the heading of the candidate section,
|
|
geoffers@262
|
175 |
# then create a new section, and append it to candidate
|
|
geoffers@262
|
176 |
# section. (This does not change which section is the
|
|
geoffers@262
|
177 |
# last section in the outline.) Let current section be
|
|
geoffers@262
|
178 |
# this new section. Let the element being entered be
|
|
geoffers@262
|
179 |
# the new heading for the current section. Abort these
|
|
geoffers@262
|
180 |
# substeps.
|
|
geoffers@263
|
181 |
if rank[element.tag] < \
|
|
geoffers@263
|
182 |
rank[candidate_section.header.tag]:
|
|
geoffers@262
|
183 |
self.current_section = section()
|
|
geoffers@262
|
184 |
candidate_section.append(self.current_section)
|
|
geoffers@262
|
185 |
self.current_section.header = element
|
|
geoffers@262
|
186 |
break
|
|
geoffers@263
|
187 |
# Let new candidate section be the section that
|
|
geoffers@263
|
188 |
# contains candidate section in the outline of current
|
|
geoffers@263
|
189 |
# outlinee.
|
|
geoffers@262
|
190 |
# Let candidate section be new candidate section.
|
|
geoffers@262
|
191 |
candidate_section = candidate_section.parent
|
|
geoffers@262
|
192 |
# Return to step 2.
|
|
geoffers@263
|
193 |
# Push the element being entered onto the stack. (This causes
|
|
geoffers@263
|
194 |
# the algorithm to skip any descendants of the element.)
|
|
geoffers@262
|
195 |
self.stack.append(element)
|
|
geoffers@262
|
196 |
|
|
geoffers@263
|
197 |
# If the current outlinee is null, then there was no sectioning content
|
|
geoffers@263
|
198 |
# element or sectioning root element in the DOM. There is no outline.
|
|
geoffers@262
|
199 |
try:
|
|
geoffers@262
|
200 |
return self.outlines[self.current_outlinee]
|
|
geoffers@262
|
201 |
except KeyError:
|
|
geoffers@263
|
202 |
return None
|