anolislib/processes/outliner.py
author Geoffrey Sneddon <geoffers@gmail.com>
Sun Feb 22 23:36:54 2009 +0000 (2009-02-22)
changeset 298 b6d93515d41e
parent 262 9cbda255bfdb
child 299 7db2a3fe1af2
permissions -rw-r--r--
Woops. Missed a case of "encoding" that has to be "output_encoding".
geoffers@34
     1
# coding=UTF-8
geoffers@34
     2
# Copyright (c) 2008 Geoffrey Sneddon
geoffers@262
     3
#
geoffers@34
     4
# Permission is hereby granted, free of charge, to any person obtaining a copy
geoffers@34
     5
# of this software and associated documentation files (the "Software"), to deal
geoffers@34
     6
# in the Software without restriction, including without limitation the rights
geoffers@34
     7
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
geoffers@34
     8
# copies of the Software, and to permit persons to whom the Software is
geoffers@34
     9
# furnished to do so, subject to the following conditions:
geoffers@262
    10
#
geoffers@34
    11
# The above copyright notice and this permission notice shall be included in
geoffers@34
    12
# all copies or substantial portions of the Software.
geoffers@262
    13
#
geoffers@34
    14
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
geoffers@34
    15
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
geoffers@34
    16
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
geoffers@34
    17
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
geoffers@34
    18
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
geoffers@34
    19
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
geoffers@34
    20
# THE SOFTWARE.
geoffers@34
    21
geoffers@34
    22
from lxml import etree
geoffers@34
    23
geoffers@252
    24
from anolislib import utils
geoffers@34
    25
geoffers@40
    26
# Rank of heading elements (these are negative so h1 > h6)
geoffers@262
    27
rank = {u"h1": -1, u"h2": -2, u"h3": -3, u"h4": -4, u"h5": -5, u"h6": -6,
geoffers@262
    28
        u"header": -1}
geoffers@34
    29
geoffers@263
    30
geoffers@39
    31
class section(list):
geoffers@262
    32
    """Represents the section of a document."""
geoffers@39
    33
geoffers@262
    34
    header = None
geoffers@262
    35
geoffers@262
    36
    def __repr__(self):
geoffers@262
    37
        return "<section %s>" % (repr(self.header))
geoffers@262
    38
geoffers@262
    39
    def append(self, child):
geoffers@262
    40
        list.append(self, child)
geoffers@262
    41
        child.parent = self
geoffers@262
    42
geoffers@262
    43
    def extend(self, children):
geoffers@262
    44
        list.extend(self, children)
geoffers@262
    45
        for child in children:
geoffers@262
    46
            child.parent = self
geoffers@34
    47
geoffers@263
    48
geoffers@41
    49
class Outliner:
geoffers@262
    50
    """Build the outline of an HTML document."""
geoffers@262
    51
geoffers@262
    52
    def __init__(self, ElementTree, **kwargs):
geoffers@262
    53
        self.ElementTree = ElementTree
geoffers@262
    54
        self.stack = []
geoffers@262
    55
        self.outlines = {}
geoffers@262
    56
        self.current_outlinee = None
geoffers@262
    57
        self.current_section = None
geoffers@262
    58
geoffers@262
    59
    def build(self, **kwargs):
geoffers@262
    60
        for action, element in etree.iterwalk(self.ElementTree,
geoffers@262
    61
                                              events=("start", "end")):
geoffers@262
    62
            # If the top of the stack is an element, and you are exiting that
geoffers@262
    63
            # element
geoffers@262
    64
            if action == "end" and self.stack and self.stack[-1] == element:
geoffers@262
    65
                # Note: The element being exited is a heading content element.
geoffers@262
    66
                assert element.tag in utils.heading_content
geoffers@262
    67
                # Pop that element from the stack.
geoffers@262
    68
                self.stack.pop()
geoffers@262
    69
geoffers@262
    70
            # If the top of the stack is a heading content element
geoffers@262
    71
            elif self.stack and self.stack[-1].tag in utils.heading_content:
geoffers@262
    72
                # Do nothing.
geoffers@262
    73
                pass
geoffers@262
    74
geoffers@262
    75
            # When entering a sectioning content element or a sectioning root
geoffers@262
    76
            # element
geoffers@262
    77
            elif action == "start" and \
geoffers@262
    78
                 (element.tag in utils.sectioning_content or \
geoffers@262
    79
                  element.tag in utils.sectioning_root):
geoffers@262
    80
                # If current outlinee is not null, push current outlinee onto
geoffers@262
    81
                # the stack.
geoffers@262
    82
                if self.current_outlinee is not None:
geoffers@262
    83
                    self.stack.append(self.current_outlinee)
geoffers@262
    84
                # Let current outlinee be the element that is being entered.
geoffers@262
    85
                self.current_outlinee = element
geoffers@262
    86
                # Let current section be a newly created section for the
geoffers@262
    87
                # current outlinee element.
geoffers@262
    88
                self.current_section = section()
geoffers@262
    89
                # Let there be a new outline for the new current outlinee,
geoffers@262
    90
                # initialized with just the new current section as the only
geoffers@262
    91
                # section in the outline.
geoffers@262
    92
                self.outlines[self.current_outlinee] = [self.current_section]
geoffers@262
    93
geoffers@262
    94
            # When exiting a sectioning content element, if the stack is not
geoffers@262
    95
            # empty
geoffers@262
    96
            elif action == "end" and \
geoffers@262
    97
                 element.tag in utils.sectioning_content and self.stack:
geoffers@262
    98
                # Pop the top element from the stack, and let the current
geoffers@262
    99
                # outlinee be that element.
geoffers@262
   100
                self.current_outlinee = self.stack.pop()
geoffers@262
   101
                # Let current section be the last section in the outline of the
geoffers@262
   102
                # current outlinee element.
geoffers@262
   103
                self.current_section = self.outlines[self.current_outlinee][-1]
geoffers@262
   104
                # Append the outline of the sectioning content element being
geoffers@262
   105
                # exited to the current section. (This does not change which
geoffers@262
   106
                # section is the last section in the outline.)
geoffers@262
   107
                self.current_section += self.outlines[element]
geoffers@262
   108
geoffers@262
   109
            # When exiting a sectioning root element, if the stack is not empty
geoffers@262
   110
            elif action == "end" and element.tag in utils.sectioning_root and \
geoffers@262
   111
                 self.stack:
geoffers@262
   112
                # Pop the top element from the stack, and let the current
geoffers@262
   113
                # outlinee be that element.
geoffers@262
   114
                self.current_outlinee = self.stack.pop()
geoffers@262
   115
                # Let current section be the last section in the outline of the
geoffers@262
   116
                # current outlinee element.
geoffers@262
   117
                self.current_section = self.outlines[self.current_outlinee][-1]
geoffers@262
   118
                # Loop: If current section has no child sections, stop these
geoffers@262
   119
                # steps.
geoffers@262
   120
                while self.current_section:
geoffers@262
   121
                    # Let current section be the last child section of the
geoffers@262
   122
                    # current current section.
geoffers@262
   123
                    assert self.current_section != self.current_section[-1]
geoffers@262
   124
                    self.current_section = self.current_section[-1]
geoffers@262
   125
                    # Go back to the substep labeled Loop.
geoffers@262
   126
geoffers@262
   127
            # When exiting a sectioning content element or a sectioning root
geoffers@262
   128
            # element
geoffers@262
   129
            elif action == "end" and \
geoffers@262
   130
                 (element.tag in utils.sectioning_content or \
geoffers@262
   131
                  element.tag in utils.sectioning_root):
geoffers@262
   132
                # Note: The current outlinee is the element being exited.
geoffers@262
   133
                assert self.current_outlinee == element
geoffers@262
   134
                # Let current section be the first section in the outline of
geoffers@262
   135
                # the current outlinee element.
geoffers@262
   136
                self.current_section = self.outlines[self.current_outlinee][0]
geoffers@262
   137
                # Skip to the next step in the overall set of steps. (The walk
geoffers@262
   138
                # is over.)
geoffers@262
   139
                break
geoffers@262
   140
geoffers@262
   141
            # If the current outlinee is null.
geoffers@262
   142
            elif self.current_outlinee is None:
geoffers@262
   143
                # Do nothing.
geoffers@262
   144
                pass
geoffers@262
   145
geoffers@262
   146
            # When entering a heading content element
geoffers@262
   147
            elif action == "start" and element.tag in utils.heading_content:
geoffers@262
   148
                # If the current section has no heading, let the element being
geoffers@262
   149
                # entered be the heading for the current section.
geoffers@262
   150
                if self.current_section.header is None:
geoffers@262
   151
                    self.current_section.header = element
geoffers@262
   152
geoffers@262
   153
                # Otherwise, if the element being entered has a rank equal to
geoffers@262
   154
                # or greater than the heading of the last section of the
geoffers@262
   155
                # outline of the current outlinee, then create a new section
geoffers@262
   156
                # and append it to the outline of the current outlinee element,
geoffers@262
   157
                # so that this new section is the new last section of that
geoffers@262
   158
                # outline. Let current section be that new section. Let the
geoffers@262
   159
                # element being entered be the new heading for the current
geoffers@262
   160
                # section.
geoffers@262
   161
                elif rank[element.tag] >= \
geoffers@262
   162
                     rank[self.outlines[self.current_outlinee][-1].header.tag]:
geoffers@262
   163
                    self.current_section = section()
geoffers@262
   164
                    self.outlines[self.current_outlinee] \
geoffers@262
   165
                        .append(self.current_section)
geoffers@262
   166
                    self.current_section.header = element
geoffers@262
   167
geoffers@262
   168
                # Otherwise, run these substeps:
geoffers@262
   169
                else:
geoffers@262
   170
                    # Let candidate section be current section.
geoffers@262
   171
                    candidate_section = self.current_section
geoffers@262
   172
                    while True:
geoffers@262
   173
                        # If the element being entered has a rank lower than
geoffers@262
   174
                        # the rank of the heading of the candidate section,
geoffers@262
   175
                        # then create a new section, and append it to candidate
geoffers@262
   176
                        # section. (This does not change which section is the
geoffers@262
   177
                        # last section in the outline.) Let current section be
geoffers@262
   178
                        # this new section. Let the element being entered be
geoffers@262
   179
                        # the new heading for the current section. Abort these
geoffers@262
   180
                        # substeps.
geoffers@263
   181
                        if rank[element.tag] < \
geoffers@263
   182
                           rank[candidate_section.header.tag]:
geoffers@262
   183
                            self.current_section = section()
geoffers@262
   184
                            candidate_section.append(self.current_section)
geoffers@262
   185
                            self.current_section.header = element
geoffers@262
   186
                            break
geoffers@263
   187
                        # Let new candidate section be the section that
geoffers@263
   188
                        # contains candidate section in the outline of current
geoffers@263
   189
                        # outlinee.
geoffers@262
   190
                        # Let candidate section be new candidate section.
geoffers@262
   191
                        candidate_section = candidate_section.parent
geoffers@262
   192
                        # Return to step 2.
geoffers@263
   193
                # Push the element being entered onto the stack. (This causes
geoffers@263
   194
                # the algorithm to skip any descendants of the element.)
geoffers@262
   195
                self.stack.append(element)
geoffers@262
   196
geoffers@263
   197
        # If the current outlinee is null, then there was no sectioning content
geoffers@263
   198
        # element or sectioning root element in the DOM. There is no outline.
geoffers@262
   199
        try:
geoffers@262
   200
            return self.outlines[self.current_outlinee]
geoffers@262
   201
        except KeyError:
geoffers@263
   202
            return None