# NOTE: this module depends on lxml.etree, which is imported below.
2 # Copyright (c) 2008 Geoffrey Sneddon
4 # Permission is hereby granted, free of charge, to any person obtaining a copy
5 # of this software and associated documentation files (the "Software"), to deal
6 # in the Software without restriction, including without limitation the rights
7 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 # copies of the Software, and to permit persons to whom the Software is
9 # furnished to do so, subject to the following conditions:
11 # The above copyright notice and this permission notice shall be included in
12 # all copies or substantial portions of the Software.
14 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24 from lxml import etree
25 from copy import deepcopy
27 from anolislib import utils
# Matches the phrase "latest version" case-insensitively, allowing any run of
# the characters in utils.spaceCharacters between the two words.
latest_version = re.compile(u"latest[%s]+version" % utils.spaceCharacters,
                            re.IGNORECASE)
# Captures the status code (group 1, e.g. "WD") that prefixes the last path
# segment of a dated http://www.w3.org/TR/ URL.
w3c_tr_url_status = re.compile(
    r"http://www\.w3\.org/TR/[^/]*/(MO|WD|CR|PR|REC|PER|NOTE)-")
# Read the clock once so that the year, date and cdate substitutions always
# agree with one another, even if the module is imported right at a day or
# year boundary.
_now = time.gmtime()

# Every placeholder comes as a compiled regex matching the whole
# "[NAME ...]" token, plus an "identifier" prefix that callers use as a cheap
# substring test before running the regex.

# [YEAR] -> four-digit year (UTC)
year = re.compile(r"\[YEAR[^\]]*\]")
year_sub = time.strftime(u"%Y", _now)
year_identifier = u"[YEAR"

# [DATE] -> "day month-name year" (UTC), without a leading zero on the day
date = re.compile(r"\[DATE[^\]]*\]")
date_sub = time.strftime(u"%d %B %Y", _now).lstrip(u"0")
date_identifier = u"[DATE"

# [CDATE] -> compact YYYYMMDD date (UTC)
cdate = re.compile(r"\[CDATE[^\]]*\]")
cdate_sub = time.strftime(u"%Y%m%d", _now)
cdate_identifier = u"[CDATE"

# [TITLE], [STATUS] and [LONGSTATUS] have per-document values, so only the
# pattern and identifier are defined here.
title = re.compile(r"\[TITLE[^\]]*\]")
title_identifier = u"[TITLE"

status = re.compile(r"\[STATUS[^\]]*\]")
status_identifier = u"[STATUS"

longstatus = re.compile(r"\[LONGSTATUS[^\]]*\]")
longstatus_identifier = u"[LONGSTATUS"
# Maps a short W3C status code to the long status name that replaces
# [LONGSTATUS] in compat. mode.
longstatus_map = {
    u"MO": u"W3C Member-only Draft",
    u"ED": u"Editor's Draft",
    u"WD": u"W3C Working Draft",
    u"CR": u"W3C Candidate Recommendation",
    u"PR": u"W3C Proposed Recommendation",
    u"REC": u"W3C Recommendation",
    u"PER": u"W3C Proposed Edited Recommendation",
    u"NOTE": u"W3C Working Group Note",
}
# W3C TR stylesheet URLs: the identifier is the fixed prefix used as a cheap
# substring test, the regex additionally consumes the upper-case status
# suffix (e.g. "...W3C-WD") so the whole URL stem can be swapped.
w3c_stylesheet_identifier = u"http://www.w3.org/StyleSheets/TR/W3C-"
w3c_stylesheet = re.compile(
    r"http://www\.w3\.org/StyleSheets/TR/W3C-[A-Z]+")
# String substitutions that apply to every document, as
# (regex, replacement, identifier) triples.
string_subs = (
    (year, year_sub, year_identifier),
    (date, date_sub, date_identifier),
    (cdate, cdate_sub, cdate_identifier),
)
# Comment substitutions: each pair is the marker name looked for in comments
# (as "name", "begin-name" and "end-name") and the element inserted for it.
logo = u"logo"
logo_sub = etree.fromstring(u'<p><a href="http://www.w3.org/"><img alt="W3C" src="http://www.w3.org/Icons/w3c_home"/></a></p>')

copyright = u"copyright"
copyright_sub = etree.fromstring(u'<p class="copyright"><a href="http://www.w3.org/Consortium/Legal/ipr-notice#Copyright">Copyright</a> © %s <a href="http://www.w3.org/"><acronym title="World Wide Web Consortium">W3C</acronym></a><sup>®</sup> (<a href="http://www.csail.mit.edu/"><acronym title="Massachusetts Institute of Technology">MIT</acronym></a>, <a href="http://www.ercim.org/"><acronym title="European Research Consortium for Informatics and Mathematics">ERCIM</acronym></a>, <a href="http://www.keio.ac.jp/">Keio</a>), All Rights Reserved. W3C <a href="http://www.w3.org/Consortium/Legal/ipr-notice#Legal_Disclaimer">liability</a>, <a href="http://www.w3.org/Consortium/Legal/ipr-notice#W3C_Trademarks">trademark</a> and <a href="http://www.w3.org/Consortium/Legal/copyright-documents">document use</a> rules apply.</p>' % time.strftime(u"%Y", time.gmtime()))

# Comment substitutions performed regardless of compat. mode (none).
basic_comment_subs = ()
83 """Perform substitutions."""
def __init__(self, ElementTree, w3c_compat=False,
             w3c_compat_substitutions=False,
             w3c_compat_crazy_substitutions=False, **kwargs):
    """Run the string and comment substitutions over ElementTree.

    When any of the three compat. flags is true, the document's W3C
    status is looked up first and stored on self.w3c_status. Extra
    keyword arguments are forwarded unchanged to every step.
    """
    compat_flags = (w3c_compat, w3c_compat_substitutions,
                    w3c_compat_crazy_substitutions)

    # The status is only needed by the compat. substitutions
    if any(compat_flags):
        self.w3c_status = self.getW3CStatus(ElementTree, **kwargs)

    self.stringSubstitutions(ElementTree, *compat_flags, **kwargs)
    self.commentSubstitutions(ElementTree, *compat_flags, **kwargs)
def stringSubstitutions(self, ElementTree, w3c_compat=False,
                        w3c_compat_substitutions=False,
                        w3c_compat_crazy_substitutions=False, **kwargs):
    """Replace [PLACEHOLDER] tokens throughout the tree, in place.

    Every node's text, tail and attribute values are checked. [YEAR],
    [DATE], [CDATE] and [TITLE] are always substituted. [STATUS] and
    [LONGSTATUS] are added when w3c_compat or w3c_compat_substitutions
    is true, and the W3C stylesheet URL when
    w3c_compat_crazy_substitutions is true. The compat. substitutions
    read self.w3c_status, so it must have been set beforehand.
    """
    # Get doc_title from the title element
    try:
        doc_title = utils.textContent(ElementTree.getroot().find(u"head")
                                                 .find(u"title"))
    except (AttributeError, TypeError):
        # No title could be read; substitute an empty string
        doc_title = u""

    # Get all the subs we want
    instance_string_subs = string_subs + \
                           ((title, doc_title, title_identifier), )

    # And even more in compat. mode
    if w3c_compat or w3c_compat_substitutions:
        # Get the right long status
        doc_longstatus = longstatus_map[self.w3c_status]
        instance_string_subs += ((status, self.w3c_status,
                                  status_identifier),
                                 (longstatus, doc_longstatus,
                                  longstatus_identifier))

    # And more that aren't even enabled by default in compat. mode
    if w3c_compat_crazy_substitutions:
        # Get the right stylesheet
        doc_w3c_stylesheet = u"http://www.w3.org/StyleSheets/TR/W3C-" + \
                             self.w3c_status
        instance_string_subs += ((w3c_stylesheet, doc_w3c_stylesheet,
                                  w3c_stylesheet_identifier), )

    def literal(value):
        # regex.sub() expands backslash escapes in a *string* replacement
        # (so a title containing "\n" or "\g" would be mangled or raise);
        # a callable replacement is inserted verbatim.
        return lambda match: value

    literal_subs = [(regex, literal(sub), identifier)
                    for regex, sub, identifier in instance_string_subs]

    for node in ElementTree.iter():
        for regex, replacement, identifier in literal_subs:
            # The identifier test is a cheap pre-check before the regex
            if node.text is not None and identifier in node.text:
                node.text = regex.sub(replacement, node.text)
            if node.tail is not None and identifier in node.tail:
                node.tail = regex.sub(replacement, node.tail)
            for name, value in node.attrib.items():
                if identifier in value:
                    node.attrib[name] = regex.sub(replacement, value)
def commentSubstitutions(self, ElementTree, w3c_compat=False,
                         w3c_compat_substitutions=False,
                         w3c_compat_crazy_substitutions=False, **kwargs):
    """Perform the substitutions that are driven by marker comments.

    Two passes are made over the tree, which is modified in place:

    * Everything between a begin-link and an end-link comment is copied
      into a new <a> element whose href is set to its text content.
    * For each (name, element) pair in the basic substitutions (plus the
      logo and copyright pairs when w3c_compat or
      w3c_compat_substitutions is true), the content between begin-name
      and end-name comments is replaced by a copy of the element, and a
      bare "name" comment is expanded into begin-name, the element and
      end-name.

    Raises DifferentParentException if an end marker comment does not
    have the same parent as its begin marker comment.
    """
    # Basic substitutions
    instance_basic_comment_subs = basic_comment_subs

    # Add more basic substitutions in compat. mode
    if w3c_compat or w3c_compat_substitutions:
        instance_basic_comment_subs += ((logo, logo_sub),
                                        (copyright, copyright_sub))

    # Set of nodes to remove
    to_remove = set()

    # Link
    in_link = False
    for node in ElementTree.iter():
        if in_link:
            if node.tag is etree.Comment and \
               node.text.strip(utils.spaceCharacters) == u"end-link":
                if node.getparent() is not link_parent:
                    raise DifferentParentException(u"begin-link and end-link have different parents")
                utils.removeInteractiveContentChildren(link)
                link.set(u"href", utils.textContent(link))
                in_link = False
            else:
                # Only siblings of the markers are copied into the link;
                # their descendants come along with the deep copy.
                if node.getparent() is link_parent:
                    link.append(deepcopy(node))
                to_remove.add(node)
        elif node.tag is etree.Comment and \
             node.text.strip(utils.spaceCharacters) == u"begin-link":
            link_parent = node.getparent()
            in_link = True
            link = etree.Element(u"a")
            link.text = node.tail
            # The text now belongs to the link, so drop it from the comment
            node.tail = None
            node.addnext(link)

    # Basic substitutions
    for comment, sub in instance_basic_comment_subs:
        begin_sub = u"begin-" + comment
        end_sub = u"end-" + comment
        in_sub = False
        for node in ElementTree.iter():
            if in_sub:
                if node.tag is etree.Comment and \
                   node.text.strip(utils.spaceCharacters) == end_sub:
                    if node.getparent() is not sub_parent:
                        # Both names must go in one tuple: "% a, b" would
                        # format with begin_sub only and raise TypeError.
                        raise DifferentParentException(u"%s and %s have different parents" % (begin_sub, end_sub))
                    in_sub = False
                else:
                    to_remove.add(node)
            elif node.tag is etree.Comment:
                if node.text.strip(utils.spaceCharacters) == begin_sub:
                    sub_parent = node.getparent()
                    in_sub = True
                    # NOTE(review): tail cleared as in the begin-link
                    # branch so old text does not precede the new element
                    # - confirm against upstream.
                    node.tail = None
                    node.addnext(deepcopy(sub))
                elif node.text.strip(utils.spaceCharacters) == comment:
                    node.addprevious(etree.Comment(begin_sub))
                    node.addprevious(deepcopy(sub))
                    node.addprevious(etree.Comment(end_sub))
                    # Hand the tail to the end marker, as the bare comment
                    # itself is about to be removed
                    node.getprevious().tail = node.tail
                    to_remove.add(node)

    # Remove nodes
    for node in to_remove:
        node.getparent().remove(node)
def getW3CStatus(self, ElementTree, **kwargs):
    """Return the short W3C status code of the document, e.g. u"WD".

    The first matching text node decides the result: a "latest version"
    phrase gives u"ED", a dated http://www.w3.org/TR/ URL gives the
    status code embedded in it. u"ED" is also the default when nothing
    matches.
    """
    # Get all text nodes that contain case-insensitively "latest version"
    # with any amount of whitespace inside the phrase, or contain
    # http://www.w3.org/TR/
    for text in ElementTree.xpath(u"//text()[contains(translate(., 'LATEST', 'latest'), 'latest') and contains(translate(., 'VERSION', 'version'), 'version') or contains(., 'http://www.w3.org/TR/')]"):
        if latest_version.search(text):
            return u"ED"
        # Search once and reuse the match object
        match = w3c_tr_url_status.search(text)
        if match is not None:
            return match.group(1)
    # Didn't find any status, return the default (ED)
    return u"ED"
class DifferentParentException(utils.AnolisException):
    """A begin-* marker comment and its matching end-* marker comment (for
    example begin-link and end-link) do not have the same parent."""