anolislib/processes/outliner.py
author Geoffrey Sneddon <geoffers@gmail.com>
Mon Jul 13 14:10:19 2009 +0200 (2009-07-13)
changeset 299 7db2a3fe1af2
parent 263 9c59aa5c44cc
permissions -rw-r--r--
Update Outliner to match the latest version of the spec; header/hgroup change and hgroup rank change.
     1 # coding=UTF-8
     2 # Copyright (c) 2008 Geoffrey Sneddon
     3 #
     4 # Permission is hereby granted, free of charge, to any person obtaining a copy
     5 # of this software and associated documentation files (the "Software"), to deal
     6 # in the Software without restriction, including without limitation the rights
     7 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     8 # copies of the Software, and to permit persons to whom the Software is
     9 # furnished to do so, subject to the following conditions:
    10 #
    11 # The above copyright notice and this permission notice shall be included in
    12 # all copies or substantial portions of the Software.
    13 #
    14 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    15 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    16 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    17 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    18 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    19 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    20 # THE SOFTWARE.
    21 
    22 from lxml import etree
    23 
    24 from anolislib import utils
    25 
    26 # Rank of heading elements (these are negative so h1 > h6)
    27 fixedRank = {u"h1": -1, u"h2": -2, u"h3": -3, u"h4": -4, u"h5": -5, u"h6": -6}
    28 
    29 
    30 class section(list):
    31     """Represents the section of a document."""
    32 
    33     header = None
    34 
    35     def __repr__(self):
    36         return "<section %s>" % (repr(self.header))
    37 
    38     def append(self, child):
    39         list.append(self, child)
    40         child.parent = self
    41 
    42     def extend(self, children):
    43         list.extend(self, children)
    44         for child in children:
    45             child.parent = self
    46 
    47 
    48 class Outliner:
    49     """Build the outline of an HTML document."""
    50 
    51     def __init__(self, ElementTree, **kwargs):
    52         self.ElementTree = ElementTree
    53         self.stack = []
    54         self.outlines = {}
    55         self.current_outlinee = None
    56         self.current_section = None
    57     
    58     def _rank(self, element):
    59         if element.tag in fixedRank:
    60             return fixedRank[element.tag]
    61         # The rank of an hgroup element is the rank of the highest-ranked
    62         # h1–h6 element descendant of the hgroup element, if there are any such
    63         # elements, or otherwise the same as for an h1 element (the highest
    64         # rank).
    65         elif element.tag == u"hgroup":
    66             for i in range(1, 6):
    67                 if element.find(u".//h" + unicode(i)) is not None:
    68                     return -i
    69             else:
    70                 return -1
    71         else:
    72             raise ValueError, "Only h1–h6 and hgroup elements have a rank"
    73 
    74     def build(self, **kwargs):
    75         for action, element in etree.iterwalk(self.ElementTree,
    76                                               events=("start", "end")):
    77             # If the top of the stack is an element, and you are exiting that
    78             # element
    79             if action == "end" and self.stack and self.stack[-1] == element:
    80                 # Note: The element being exited is a heading content element.
    81                 assert element.tag in utils.heading_content
    82                 # Pop that element from the stack.
    83                 self.stack.pop()
    84 
    85             # If the top of the stack is a heading content element
    86             elif self.stack and self.stack[-1].tag in utils.heading_content:
    87                 # Do nothing.
    88                 pass
    89 
    90             # When entering a sectioning content element or a sectioning root
    91             # element
    92             elif action == "start" and \
    93                  (element.tag in utils.sectioning_content or \
    94                   element.tag in utils.sectioning_root):
    95                 # If current outlinee is not null, push current outlinee onto
    96                 # the stack.
    97                 if self.current_outlinee is not None:
    98                     self.stack.append(self.current_outlinee)
    99                 # Let current outlinee be the element that is being entered.
   100                 self.current_outlinee = element
   101                 # Let current section be a newly created section for the
   102                 # current outlinee element.
   103                 self.current_section = section()
   104                 # Let there be a new outline for the new current outlinee,
   105                 # initialized with just the new current section as the only
   106                 # section in the outline.
   107                 self.outlines[self.current_outlinee] = [self.current_section]
   108 
   109             # When exiting a sectioning content element, if the stack is not
   110             # empty
   111             elif action == "end" and \
   112                  element.tag in utils.sectioning_content and self.stack:
   113                 # Pop the top element from the stack, and let the current
   114                 # outlinee be that element.
   115                 self.current_outlinee = self.stack.pop()
   116                 # Let current section be the last section in the outline of the
   117                 # current outlinee element.
   118                 self.current_section = self.outlines[self.current_outlinee][-1]
   119                 # Append the outline of the sectioning content element being
   120                 # exited to the current section. (This does not change which
   121                 # section is the last section in the outline.)
   122                 self.current_section += self.outlines[element]
   123 
   124             # When exiting a sectioning root element, if the stack is not empty
   125             elif action == "end" and element.tag in utils.sectioning_root and \
   126                  self.stack:
   127                 # Pop the top element from the stack, and let the current
   128                 # outlinee be that element.
   129                 self.current_outlinee = self.stack.pop()
   130                 # Let current section be the last section in the outline of the
   131                 # current outlinee element.
   132                 self.current_section = self.outlines[self.current_outlinee][-1]
   133                 # Loop: If current section has no child sections, stop these
   134                 # steps.
   135                 while self.current_section:
   136                     # Let current section be the last child section of the
   137                     # current current section.
   138                     assert self.current_section != self.current_section[-1]
   139                     self.current_section = self.current_section[-1]
   140                     # Go back to the substep labeled Loop.
   141 
   142             # When exiting a sectioning content element or a sectioning root
   143             # element
   144             elif action == "end" and \
   145                  (element.tag in utils.sectioning_content or \
   146                   element.tag in utils.sectioning_root):
   147                 # Note: The current outlinee is the element being exited.
   148                 assert self.current_outlinee == element
   149                 # Let current section be the first section in the outline of
   150                 # the current outlinee element.
   151                 self.current_section = self.outlines[self.current_outlinee][0]
   152                 # Skip to the next step in the overall set of steps. (The walk
   153                 # is over.)
   154                 break
   155 
   156             # If the current outlinee is null.
   157             elif self.current_outlinee is None:
   158                 # Do nothing.
   159                 pass
   160 
   161             # When entering a heading content element
   162             elif action == "start" and element.tag in utils.heading_content:
   163                 # If the current section has no heading, let the element being
   164                 # entered be the heading for the current section.
   165                 if self.current_section.header is None:
   166                     self.current_section.header = element
   167 
   168                 # Otherwise, if the element being entered has a rank equal to
   169                 # or greater than the heading of the last section of the
   170                 # outline of the current outlinee, then create a new section
   171                 # and append it to the outline of the current outlinee element,
   172                 # so that this new section is the new last section of that
   173                 # outline. Let current section be that new section. Let the
   174                 # element being entered be the new heading for the current
   175                 # section.
   176                 elif self._rank(element) >= \
   177                      self._rank(self.outlines[self.current_outlinee][-1].header):
   178                     self.current_section = section()
   179                     self.outlines[self.current_outlinee] \
   180                         .append(self.current_section)
   181                     self.current_section.header = element
   182 
   183                 # Otherwise, run these substeps:
   184                 else:
   185                     # Let candidate section be current section.
   186                     candidate_section = self.current_section
   187                     while True:
   188                         # If the element being entered has a rank lower than
   189                         # the rank of the heading of the candidate section,
   190                         # then create a new section, and append it to candidate
   191                         # section. (This does not change which section is the
   192                         # last section in the outline.) Let current section be
   193                         # this new section. Let the element being entered be
   194                         # the new heading for the current section. Abort these
   195                         # substeps.
   196                         if self._rank(element) < \
   197                            self._rank(candidate_section.header):
   198                             self.current_section = section()
   199                             candidate_section.append(self.current_section)
   200                             self.current_section.header = element
   201                             break
   202                         # Let new candidate section be the section that
   203                         # contains candidate section in the outline of current
   204                         # outlinee.
   205                         # Let candidate section be new candidate section.
   206                         candidate_section = candidate_section.parent
   207                         # Return to step 2.
   208                 # Push the element being entered onto the stack. (This causes
   209                 # the algorithm to skip any descendants of the element.)
   210                 self.stack.append(element)
   211 
   212         # If the current outlinee is null, then there was no sectioning content
   213         # element or sectioning root element in the DOM. There is no outline.
   214         try:
   215             return self.outlines[self.current_outlinee]
   216         except KeyError:
   217             return None