anolislib/processes/outliner.py
author Geoffrey Sneddon <geoffers@gmail.com>
Sun Sep 28 11:03:12 2008 +0100 (2008-09-28)
changeset 262 9cbda255bfdb
parent 252 ec226b16dbf1
child 263 9c59aa5c44cc
permissions -rw-r--r--
Start move to PEP 8 coding standards.
     1 # coding=UTF-8
     2 # Copyright (c) 2008 Geoffrey Sneddon
     3 #
     4 # Permission is hereby granted, free of charge, to any person obtaining a copy
     5 # of this software and associated documentation files (the "Software"), to deal
     6 # in the Software without restriction, including without limitation the rights
     7 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     8 # copies of the Software, and to permit persons to whom the Software is
     9 # furnished to do so, subject to the following conditions:
    10 #
    11 # The above copyright notice and this permission notice shall be included in
    12 # all copies or substantial portions of the Software.
    13 #
    14 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    15 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    16 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    17 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    18 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    19 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    20 # THE SOFTWARE.
    21 
    22 from lxml import etree
    23 
    24 from anolislib import utils
    25 
    26 # Rank of heading elements (these are negative so h1 > h6)
    27 rank = {u"h1": -1, u"h2": -2, u"h3": -3, u"h4": -4, u"h5": -5, u"h6": -6,
    28         u"header": -1}
    29 
    30 class section(list):
    31     """Represents the section of a document."""
    32 
    33     header = None
    34 
    35     def __repr__(self):
    36         return "<section %s>" % (repr(self.header))
    37 
    38     def append(self, child):
    39         list.append(self, child)
    40         child.parent = self
    41 
    42     def extend(self, children):
    43         list.extend(self, children)
    44         for child in children:
    45             child.parent = self
    46 
    47 class Outliner:
    48     """Build the outline of an HTML document."""
    49 
    50     def __init__(self, ElementTree, **kwargs):
    51         self.ElementTree = ElementTree
    52         self.stack = []
    53         self.outlines = {}
    54         self.current_outlinee = None
    55         self.current_section = None
    56 
    57     def build(self, **kwargs):
    58         for action, element in etree.iterwalk(self.ElementTree,
    59                                               events=("start", "end")):
    60             # If the top of the stack is an element, and you are exiting that
    61             # element
    62             if action == "end" and self.stack and self.stack[-1] == element:
    63                 # Note: The element being exited is a heading content element.
    64                 assert element.tag in utils.heading_content
    65                 # Pop that element from the stack.
    66                 self.stack.pop()
    67 
    68             # If the top of the stack is a heading content element
    69             elif self.stack and self.stack[-1].tag in utils.heading_content:
    70                 # Do nothing.
    71                 pass
    72 
    73             # When entering a sectioning content element or a sectioning root
    74             # element
    75             elif action == "start" and \
    76                  (element.tag in utils.sectioning_content or \
    77                   element.tag in utils.sectioning_root):
    78                 # If current outlinee is not null, push current outlinee onto
    79                 # the stack.
    80                 if self.current_outlinee is not None:
    81                     self.stack.append(self.current_outlinee)
    82                 # Let current outlinee be the element that is being entered.
    83                 self.current_outlinee = element
    84                 # Let current section be a newly created section for the
    85                 # current outlinee element.
    86                 self.current_section = section()
    87                 # Let there be a new outline for the new current outlinee,
    88                 # initialized with just the new current section as the only
    89                 # section in the outline.
    90                 self.outlines[self.current_outlinee] = [self.current_section]
    91 
    92             # When exiting a sectioning content element, if the stack is not
    93             # empty
    94             elif action == "end" and \
    95                  element.tag in utils.sectioning_content and self.stack:
    96                 # Pop the top element from the stack, and let the current
    97                 # outlinee be that element.
    98                 self.current_outlinee = self.stack.pop()
    99                 # Let current section be the last section in the outline of the
   100                 # current outlinee element.
   101                 self.current_section = self.outlines[self.current_outlinee][-1]
   102                 # Append the outline of the sectioning content element being
   103                 # exited to the current section. (This does not change which
   104                 # section is the last section in the outline.)
   105                 self.current_section += self.outlines[element]
   106 
   107             # When exiting a sectioning root element, if the stack is not empty
   108             elif action == "end" and element.tag in utils.sectioning_root and \
   109                  self.stack:
   110                 # Pop the top element from the stack, and let the current
   111                 # outlinee be that element.
   112                 self.current_outlinee = self.stack.pop()
   113                 # Let current section be the last section in the outline of the
   114                 # current outlinee element.
   115                 self.current_section = self.outlines[self.current_outlinee][-1]
   116                 # Loop: If current section has no child sections, stop these
   117                 # steps.
   118                 while self.current_section:
   119                     # Let current section be the last child section of the
   120                     # current current section.
   121                     assert self.current_section != self.current_section[-1]
   122                     self.current_section = self.current_section[-1]
   123                     # Go back to the substep labeled Loop.
   124 
   125             # When exiting a sectioning content element or a sectioning root
   126             # element
   127             elif action == "end" and \
   128                  (element.tag in utils.sectioning_content or \
   129                   element.tag in utils.sectioning_root):
   130                 # Note: The current outlinee is the element being exited.
   131                 assert self.current_outlinee == element
   132                 # Let current section be the first section in the outline of
   133                 # the current outlinee element.
   134                 self.current_section = self.outlines[self.current_outlinee][0]
   135                 # Skip to the next step in the overall set of steps. (The walk
   136                 # is over.)
   137                 break
   138 
   139             # If the current outlinee is null.
   140             elif self.current_outlinee is None:
   141                 # Do nothing.
   142                 pass
   143 
   144             # When entering a heading content element
   145             elif action == "start" and element.tag in utils.heading_content:
   146                 # If the current section has no heading, let the element being
   147                 # entered be the heading for the current section.
   148                 if self.current_section.header is None:
   149                     self.current_section.header = element
   150 
   151                 # Otherwise, if the element being entered has a rank equal to
   152                 # or greater than the heading of the last section of the
   153                 # outline of the current outlinee, then create a new section
   154                 # and append it to the outline of the current outlinee element,
   155                 # so that this new section is the new last section of that
   156                 # outline. Let current section be that new section. Let the
   157                 # element being entered be the new heading for the current
   158                 # section.
   159                 elif rank[element.tag] >= \
   160                      rank[self.outlines[self.current_outlinee][-1].header.tag]:
   161                     self.current_section = section()
   162                     self.outlines[self.current_outlinee] \
   163                         .append(self.current_section)
   164                     self.current_section.header = element
   165 
   166                 # Otherwise, run these substeps:
   167                 else:
   168                     # Let candidate section be current section.
   169                     candidate_section = self.current_section
   170                     while True:
   171                         # If the element being entered has a rank lower than
   172                         # the rank of the heading of the candidate section,
   173                         # then create a new section, and append it to candidate
   174                         # section. (This does not change which section is the
   175                         # last section in the outline.) Let current section be
   176                         # this new section. Let the element being entered be
   177                         # the new heading for the current section. Abort these
   178                         # substeps.
   179                         if rank[element.tag] < rank[candidate_section.header.tag]:
   180                             self.current_section = section()
   181                             candidate_section.append(self.current_section)
   182                             self.current_section.header = element
   183                             break
   184                         # Let new candidate section be the section that contains candidate section in the outline of current outlinee.
   185                         # Let candidate section be new candidate section.
   186                         candidate_section = candidate_section.parent
   187                         # Return to step 2.
   188                 # Push the element being entered onto the stack. (This causes the algorithm to skip any descendants of the element.)
   189                 self.stack.append(element)
   190 
   191         # If the current outlinee is null, then there was no sectioning content element or sectioning root element in the DOM. There is no outline.
   192         try:
   193             return self.outlines[self.current_outlinee]
   194         except KeyError:
   195             return None