anolislib/processes/sub.py
author Geoffrey Sneddon <geoffers@gmail.com>
Tue Sep 30 22:09:11 2008 +0100 (2008-09-30)
changeset 267 d0260bea7c0f
parent 262 9cbda255bfdb
permissions -rw-r--r--
We need to import lxml.etree here.
     1 # coding=UTF-8
     2 # Copyright (c) 2008 Geoffrey Sneddon
     3 #
     4 # Permission is hereby granted, free of charge, to any person obtaining a copy
     5 # of this software and associated documentation files (the "Software"), to deal
     6 # in the Software without restriction, including without limitation the rights
     7 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     8 # copies of the Software, and to permit persons to whom the Software is
     9 # furnished to do so, subject to the following conditions:
    10 #
    11 # The above copyright notice and this permission notice shall be included in
    12 # all copies or substantial portions of the Software.
    13 #
    14 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    15 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    16 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    17 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    18 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    19 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    20 # THE SOFTWARE.
    21 
    22 import re
    23 import time
    24 from lxml import etree
    25 from copy import deepcopy
    26 
    27 from anolislib import utils
    28 
    # Matches "latest version" case-insensitively, with one or more of the
    # characters from utils.spaceCharacters between the two words.
    latest_version = re.compile(u"latest[%s]+version" % utils.spaceCharacters,
                                re.IGNORECASE)

    # Matches a W3C TR URL carrying a status prefix and captures that status
    # code (group 1).
    w3c_tr_url_status = re.compile(
        r"http://www\.w3\.org/TR/[^/]*/(MO|WD|CR|PR|REC|PER|NOTE)-")

    # Take a single snapshot of the current UTC time so that every date-derived
    # substitution below agrees, even if the module happens to be imported
    # while the clock rolls over to a new day or year.
    _now = time.gmtime()

    # Each substitution is described by three names: the compiled pattern that
    # matches the whole marker, the replacement text, and a cheap literal
    # prefix ("identifier") used to skip strings that cannot contain the marker.
    year = re.compile(r"\[YEAR[^\]]*\]")
    year_sub = time.strftime(u"%Y", _now)
    year_identifier = u"[YEAR"

    date = re.compile(r"\[DATE[^\]]*\]")
    # %d is zero-padded; strip the padding so the 5th reads "5 ..." not "05 ..."
    date_sub = time.strftime(u"%d %B %Y", _now).lstrip(u"0")
    date_identifier = u"[DATE"

    cdate = re.compile(r"\[CDATE[^\]]*\]")
    cdate_sub = time.strftime(u"%Y%m%d", _now)
    cdate_identifier = u"[CDATE"

    # The replacement for [TITLE] depends on the document, so only the pattern
    # and identifier live here. The same goes for [STATUS] and [LONGSTATUS].
    title = re.compile(r"\[TITLE[^\]]*\]")
    title_identifier = u"[TITLE"

    status = re.compile(r"\[STATUS[^\]]*\]")
    status_identifier = u"[STATUS"

    longstatus = re.compile(r"\[LONGSTATUS[^\]]*\]")
    longstatus_identifier = u"[LONGSTATUS"
    # Status code -> human-readable status name
    longstatus_map = {
        u"MO": u"W3C Member-only Draft",
        u"ED": u"Editor's Draft",
        u"WD": u"W3C Working Draft",
        u"CR": u"W3C Candidate Recommendation",
        u"PR": u"W3C Proposed Recommendation",
        u"REC": u"W3C Recommendation",
        u"PER": u"W3C Proposed Edited Recommendation",
        u"NOTE": u"W3C Working Group Note"
    }

    w3c_stylesheet = re.compile(r"http://www\.w3\.org/StyleSheets/TR/W3C-[A-Z]+")
    w3c_stylesheet_identifier = u"http://www.w3.org/StyleSheets/TR/W3C-"

    # Substitutions that apply to every document: (pattern, replacement,
    # identifier) triples.
    string_subs = ((year, year_sub, year_identifier),
                   (date, date_sub, date_identifier),
                   (cdate, cdate_sub, cdate_identifier))

    # Comment substitutions: the comment name and the element that replaces it.
    logo = u"logo"
    logo_sub = etree.fromstring(u'<p><a href="http://www.w3.org/"><img alt="W3C" src="http://www.w3.org/Icons/w3c_home"/></a></p>')

    copyright = u"copyright"
    copyright_sub = etree.fromstring(u'<p class="copyright"><a href="http://www.w3.org/Consortium/Legal/ipr-notice#Copyright">Copyright</a> &#xA9; %s <a href="http://www.w3.org/"><acronym title="World Wide Web Consortium">W3C</acronym></a><sup>&#xAE;</sup> (<a href="http://www.csail.mit.edu/"><acronym title="Massachusetts Institute of Technology">MIT</acronym></a>, <a href="http://www.ercim.org/"><acronym title="European Research Consortium for Informatics and Mathematics">ERCIM</acronym></a>, <a href="http://www.keio.ac.jp/">Keio</a>), All Rights Reserved. W3C <a href="http://www.w3.org/Consortium/Legal/ipr-notice#Legal_Disclaimer">liability</a>, <a href="http://www.w3.org/Consortium/Legal/ipr-notice#W3C_Trademarks">trademark</a> and <a href="http://www.w3.org/Consortium/Legal/copyright-documents">document use</a> rules apply.</p>' % year_sub)

    # Comment substitutions applied regardless of mode (none at present).
    basic_comment_subs = ()
    80 
    81 
    class sub(object):
        """Perform substitutions.

        Instantiating the class applies the string substitutions and then the
        comment substitutions to the tree it is given, modifying it in place.
        """

        def __init__(self, ElementTree, w3c_compat=False,
                     w3c_compat_substitutions=False,
                     w3c_compat_crazy_substitutions=False, **kwargs):
            """Run every substitution on ElementTree.

            ElementTree is the parsed document; getroot(), iter() and xpath()
            are called on it. When any of the three w3c_compat* flags is set,
            the document status is worked out first and stored on
            self.w3c_status. All flags and any extra keyword arguments are
            passed on to the individual substitution methods.
            """
            if (w3c_compat or w3c_compat_substitutions or
                    w3c_compat_crazy_substitutions):
                self.w3c_status = self.getW3CStatus(ElementTree, **kwargs)
            self.stringSubstitutions(ElementTree, w3c_compat,
                                     w3c_compat_substitutions,
                                     w3c_compat_crazy_substitutions, **kwargs)
            self.commentSubstitutions(ElementTree, w3c_compat,
                                      w3c_compat_substitutions,
                                      w3c_compat_crazy_substitutions, **kwargs)

        def stringSubstitutions(self, ElementTree, w3c_compat=False,
                                w3c_compat_substitutions=False,
                                w3c_compat_crazy_substitutions=False, **kwargs):
            """Replace the bracketed markers ([YEAR], [DATE], [CDATE], [TITLE]).

            Markers are replaced in the text, tail and attribute values of
            every node. With w3c_compat or w3c_compat_substitutions, [STATUS]
            and [LONGSTATUS] are replaced too; with
            w3c_compat_crazy_substitutions, W3C TR stylesheet URLs are
            rewritten to end in the document status. The compat paths read
            self.w3c_status, which __init__ sets before calling this method.
            """
            # Get doc_title from the title element
            try:
                doc_title = utils.textContent(ElementTree.getroot().find(u"head")
                                                                   .find(u"title"))
            except (AttributeError, TypeError):
                # Fall back to an empty title when the lookup above fails
                # (e.g. find() returned None because there is no head).
                doc_title = u""

            # Get all the subs we want
            instance_string_subs = string_subs + \
                                   ((title, doc_title, title_identifier), )

            # And even more in compat. mode
            if w3c_compat or w3c_compat_substitutions:
                # Get the right long status
                doc_longstatus = longstatus_map[self.w3c_status]
                instance_string_subs += ((status, self.w3c_status,
                                          status_identifier),
                                         (longstatus, doc_longstatus,
                                          longstatus_identifier))

            # And more that aren't even enabled by default in compat. mode
            if w3c_compat_crazy_substitutions:
                # Get the right stylesheet
                doc_w3c_stylesheet = u"http://www.w3.org/StyleSheets/TR/W3C-" + \
                                     self.w3c_status
                instance_string_subs += ((w3c_stylesheet, doc_w3c_stylesheet,
                                          w3c_stylesheet_identifier), )

            def literal(text):
                # A string replacement is treated by re as a template, so a
                # backslash in e.g. the document title would be read as an
                # escape or group reference. A callable is inserted verbatim.
                return lambda match: text

            prepared_subs = [(regex, literal(replacement), identifier)
                             for regex, replacement, identifier
                             in instance_string_subs]

            for node in ElementTree.iter():
                for regex, replace, identifier in prepared_subs:
                    # The identifier test is a cheap pre-filter before the regex
                    if node.text is not None and identifier in node.text:
                        node.text = regex.sub(replace, node.text)
                    if node.tail is not None and identifier in node.tail:
                        node.tail = regex.sub(replace, node.tail)
                    for name, value in node.attrib.items():
                        if identifier in value:
                            node.attrib[name] = regex.sub(replace, value)

        def commentSubstitutions(self, ElementTree, w3c_compat=False,
                                 w3c_compat_substitutions=False,
                                 w3c_compat_crazy_substitutions=False, **kwargs):
            """Expand the comment-delimited substitutions.

            The siblings between a begin-link comment and its end-link comment
            are moved into a new a element whose href is set to that element's
            text content. With w3c_compat or w3c_compat_substitutions, logo
            and copyright comments (either a lone comment or a begin-/end-
            pair) are filled with a copy of the matching module-level element.

            Raises DifferentParentException when a begin- comment and its
            matching end- comment do not share a parent.
            """
            # Basic substitutions
            instance_basic_comment_subs = basic_comment_subs

            # Add more basic substitutions in compat. mode
            if w3c_compat or w3c_compat_substitutions:
                instance_basic_comment_subs += ((logo, logo_sub),
                                                (copyright, copyright_sub))

            # Set of nodes to remove; removal is deferred until both passes
            # have finished walking the tree.
            to_remove = set()

            # Link
            # NOTE(review): both passes insert nodes right after the node
            # currently being visited and appear to rely on the iterator not
            # visiting those new nodes -- confirm against lxml's iter().
            in_link = False
            for node in ElementTree.iter():
                if in_link:
                    if node.tag is etree.Comment and \
                       node.text.strip(utils.spaceCharacters) == u"end-link":
                        if node.getparent() is not link_parent:
                            raise DifferentParentException(u"begin-link and end-link have different parents")
                        utils.removeInteractiveContentChildren(link)
                        link.set(u"href", utils.textContent(link))
                        in_link = False
                    else:
                        # Only direct siblings are copied into the link;
                        # their descendants come along with the deep copy.
                        if node.getparent() is link_parent:
                            link.append(deepcopy(node))
                        to_remove.add(node)
                elif node.tag is etree.Comment and \
                     node.text.strip(utils.spaceCharacters) == u"begin-link":
                    link_parent = node.getparent()
                    in_link = True
                    link = etree.Element(u"a")
                    # The text following the comment becomes the link's text
                    link.text = node.tail
                    node.tail = None
                    node.addnext(link)

            # Basic substitutions
            for comment, replacement in instance_basic_comment_subs:
                begin_sub = u"begin-" + comment
                end_sub = u"end-" + comment
                in_sub = False
                for node in ElementTree.iter():
                    if in_sub:
                        if node.tag is etree.Comment and \
                           node.text.strip(utils.spaceCharacters) == end_sub:
                            if node.getparent() is not sub_parent:
                                # The two names must be passed as one tuple;
                                # otherwise the formatting itself fails.
                                raise DifferentParentException(u"%s and %s have different parents" % (begin_sub, end_sub))
                            in_sub = False
                        else:
                            # Old content between the markers is discarded
                            to_remove.add(node)
                    elif node.tag is etree.Comment:
                        comment_text = node.text.strip(utils.spaceCharacters)
                        if comment_text == begin_sub:
                            sub_parent = node.getparent()
                            in_sub = True
                            node.tail = None
                            node.addnext(deepcopy(replacement))
                        elif comment_text == comment:
                            # A lone comment is expanded into a begin-/end-
                            # pair around the replacement; the comment's tail
                            # moves onto the new end- comment.
                            node.addprevious(etree.Comment(begin_sub))
                            node.addprevious(deepcopy(replacement))
                            node.addprevious(etree.Comment(end_sub))
                            node.getprevious().tail = node.tail
                            to_remove.add(node)

            # Remove nodes
            for node in to_remove:
                node.getparent().remove(node)

        def getW3CStatus(self, ElementTree, **kwargs):
            """Return the status code of the document.

            Returns u"ED" when a text node mentions "latest version", the
            status captured from a W3C TR URL when one is found first, and
            u"ED" again when neither occurs.
            """
            # Get all text nodes that contain case-insensitively "latest version"
            # with any amount of whitespace inside the phrase, or contain
            # http://www.w3.org/TR/
            for text in ElementTree.xpath(u"//text()[contains(translate(., 'LATEST', 'latest'), 'latest') and contains(translate(., 'VERSION', 'version'), 'version') or contains(., 'http://www.w3.org/TR/')]"):
                if latest_version.search(text):
                    return u"ED"
                url_match = w3c_tr_url_status.search(text)
                if url_match:
                    return url_match.group(1)
            # Didn't find any status, return the default (ED)
            return u"ED"
   224 
   225 
   class DifferentParentException(utils.AnolisException):
       """A begin- comment and its matching end- comment have different parents.

       Raised for begin-link/end-link as well as for the other begin-/end-
       comment pairs handled by the substitution code.
       """