anolislib/processes/outliner.py
author Geoffrey Sneddon <geoffers@gmail.com>
Sun Feb 22 23:36:54 2009 +0000 (17 months ago)
changeset 298 b6d93515d41e
parent 2629cbda255bfdb
child 2997db2a3fe1af2
permissions -rw-r--r--
Woops. Missed a case of "encoding" that has to be "output_encoding".
     1 # coding=UTF-8
     2 # Copyright (c) 2008 Geoffrey Sneddon
     3 #
     4 # Permission is hereby granted, free of charge, to any person obtaining a copy
     5 # of this software and associated documentation files (the "Software"), to deal
     6 # in the Software without restriction, including without limitation the rights
     7 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     8 # copies of the Software, and to permit persons to whom the Software is
     9 # furnished to do so, subject to the following conditions:
    10 #
    11 # The above copyright notice and this permission notice shall be included in
    12 # all copies or substantial portions of the Software.
    13 #
    14 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    15 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    16 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    17 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    18 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    19 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    20 # THE SOFTWARE.
    21 
    22 from lxml import etree
    23 
    24 from anolislib import utils
    25 
    26 # Rank of heading elements (these are negative so h1 > h6)
    27 rank = {u"h1": -1, u"h2": -2, u"h3": -3, u"h4": -4, u"h5": -5, u"h6": -6,
    28         u"header": -1}
    29 
    30 
    31 class section(list):
    32     """Represents the section of a document."""
    33 
    34     header = None
    35 
    36     def __repr__(self):
    37         return "<section %s>" % (repr(self.header))
    38 
    39     def append(self, child):
    40         list.append(self, child)
    41         child.parent = self
    42 
    43     def extend(self, children):
    44         list.extend(self, children)
    45         for child in children:
    46             child.parent = self
    47 
    48 
class Outliner:
    """Build the outline of an HTML document.

    build() walks the tree once and follows, branch by branch, the outline
    creation steps quoted in its comments: every sectioning content or
    sectioning root element that is entered gets its own outline (a list of
    section objects), and heading content elements label or open sections
    inside the outline that is currently being built.

    Instance state:
      ElementTree       -- the tree passed to the constructor; walked by
                           build().
      stack             -- elements pushed during the walk: outlinees that
                           were set aside when a nested outlinee was entered,
                           and the heading element whose descendants are
                           currently being skipped.
      outlines          -- maps each outlinee element to its outline.
      current_outlinee  -- the element whose outline is being built, or None
                           before any outlinee has been entered.
      current_section   -- the section that headings are currently attached
                           to, or None before any outlinee has been entered.
    """

    def __init__(self, ElementTree, **kwargs):
        """Store the tree to outline and initialise the walk state.

        Extra keyword arguments are accepted but not used here.
        """
        self.ElementTree = ElementTree
        self.stack = []
        self.outlines = {}
        self.current_outlinee = None
        self.current_section = None

    def build(self, **kwargs):
        """Walk the tree and return the outline of the current outlinee.

        The branches of the if/elif chain below are tried in order for every
        "start" and "end" event, so their order is significant.

        Returns the list of sections stored in self.outlines for whichever
        element is the current outlinee when the walk stops, or None when no
        sectioning content or sectioning root element was ever entered (the
        current outlinee is still None, so the final lookup raises KeyError).

        Extra keyword arguments are accepted but not used here.
        """
        for action, element in etree.iterwalk(self.ElementTree,
                                              events=("start", "end")):
            # If the top of the stack is an element, and you are exiting that
            # element
            # NOTE(review): the comparison uses ==, not "is" -- presumably
            # element objects compare by identity here; confirm.
            if action == "end" and self.stack and self.stack[-1] == element:
                # Note: The element being exited is a heading content element.
                # (This check is an assert, so it is skipped when Python runs
                # with assertions disabled.)
                assert element.tag in utils.heading_content
                # Pop that element from the stack.
                self.stack.pop()

            # If the top of the stack is a heading content element
            elif self.stack and self.stack[-1].tag in utils.heading_content:
                # Do nothing. (Everything inside a heading is skipped until
                # the first branch pops the heading off the stack.)
                pass

            # When entering a sectioning content element or a sectioning root
            # element
            elif action == "start" and \
                 (element.tag in utils.sectioning_content or \
                  element.tag in utils.sectioning_root):
                # If current outlinee is not null, push current outlinee onto
                # the stack.
                if self.current_outlinee is not None:
                    self.stack.append(self.current_outlinee)
                # Let current outlinee be the element that is being entered.
                self.current_outlinee = element
                # Let current section be a newly created section for the
                # current outlinee element.
                self.current_section = section()
                # Let there be a new outline for the new current outlinee,
                # initialized with just the new current section as the only
                # section in the outline.
                self.outlines[self.current_outlinee] = [self.current_section]

            # When exiting a sectioning content element, if the stack is not
            # empty
            elif action == "end" and \
                 element.tag in utils.sectioning_content and self.stack:
                # Pop the top element from the stack, and let the current
                # outlinee be that element.
                self.current_outlinee = self.stack.pop()
                # Let current section be the last section in the outline of the
                # current outlinee element.
                self.current_section = self.outlines[self.current_outlinee][-1]
                # Append the outline of the sectioning content element being
                # exited to the current section. (This does not change which
                # section is the last section in the outline.)
                # NOTE(review): the heading branch further down reads .parent
                # while climbing from a section to its container; confirm
                # that sections merged in with += end up with their parent
                # set.
                self.current_section += self.outlines[element]

            # When exiting a sectioning root element, if the stack is not empty
            elif action == "end" and element.tag in utils.sectioning_root and \
                 self.stack:
                # Pop the top element from the stack, and let the current
                # outlinee be that element.
                self.current_outlinee = self.stack.pop()
                # Let current section be the last section in the outline of the
                # current outlinee element.
                self.current_section = self.outlines[self.current_outlinee][-1]
                # Loop: If current section has no child sections, stop these
                # steps.
                # (A section is a list of its child sections, so an empty
                # section is falsy and ends the loop.)
                while self.current_section:
                    # Let current section be the last child section of the
                    # current current section.
                    assert self.current_section != self.current_section[-1]
                    self.current_section = self.current_section[-1]
                    # Go back to the substep labeled Loop.

            # When exiting a sectioning content element or a sectioning root
            # element
            # (Only reached with an empty stack: the two branches above
            # handle the same exits whenever the stack is not empty.)
            elif action == "end" and \
                 (element.tag in utils.sectioning_content or \
                  element.tag in utils.sectioning_root):
                # Note: The current outlinee is the element being exited.
                assert self.current_outlinee == element
                # Let current section be the first section in the outline of
                # the current outlinee element.
                self.current_section = self.outlines[self.current_outlinee][0]
                # Skip to the next step in the overall set of steps. (The walk
                # is over.)
                break

            # If the current outlinee is null.
            elif self.current_outlinee is None:
                # Do nothing.
                pass

            # When entering a heading content element
            elif action == "start" and element.tag in utils.heading_content:
                # If the current section has no heading, let the element being
                # entered be the heading for the current section.
                if self.current_section.header is None:
                    self.current_section.header = element

                # Otherwise, if the element being entered has a rank equal to
                # or greater than the heading of the last section of the
                # outline of the current outlinee, then create a new section
                # and append it to the outline of the current outlinee element,
                # so that this new section is the new last section of that
                # outline. Let current section be that new section. Let the
                # element being entered be the new heading for the current
                # section.
                # NOTE(review): this reads .header.tag of the outline's last
                # section, which is not necessarily the current section; if
                # that section still has no heading (header is None) the
                # lookup would raise AttributeError -- confirm whether that
                # state can be reached.
                elif rank[element.tag] >= \
                     rank[self.outlines[self.current_outlinee][-1].header.tag]:
                    self.current_section = section()
                    self.outlines[self.current_outlinee] \
                        .append(self.current_section)
                    self.current_section.header = element

                # Otherwise, run these substeps:
                else:
                    # Let candidate section be current section.
                    candidate_section = self.current_section
                    # The loop has no explicit guard for running out of
                    # containing sections; it ends only through the break
                    # below.
                    while True:
                        # If the element being entered has a rank lower than
                        # the rank of the heading of the candidate section,
                        # then create a new section, and append it to candidate
                        # section. (This does not change which section is the
                        # last section in the outline.) Let current section be
                        # this new section. Let the element being entered be
                        # the new heading for the current section. Abort these
                        # substeps.
                        if rank[element.tag] < \
                           rank[candidate_section.header.tag]:
                            self.current_section = section()
                            candidate_section.append(self.current_section)
                            self.current_section.header = element
                            break
                        # Let new candidate section be the section that
                        # contains candidate section in the outline of current
                        # outlinee.
                        # Let candidate section be new candidate section.
                        candidate_section = candidate_section.parent
                        # Return to step 2.
                # Push the element being entered onto the stack. (This causes
                # the algorithm to skip any descendants of the element.)
                self.stack.append(element)

        # If the current outlinee is null, then there was no sectioning content
        # element or sectioning root element in the DOM. There is no outline.
        try:
            return self.outlines[self.current_outlinee]
        except KeyError:
            return None