+ if pieces and (pieces[0] == '<div>' or pieces[0].startswith('<div ')) and pieces[-1] == '</div>':
+ depth = 0
+ for piece in pieces[:-1]:
+ if piece.startswith('</'):
+ depth -= 1
+ if depth == 0:
+ break
+ elif piece.startswith('<') and not piece.endswith('/>'):
+ depth += 1
+ else:
+ pieces = pieces[1:-1]
+
+ # Ensure each piece is a str for Python 3
+ for (i, v) in enumerate(pieces):
+ if not isinstance(v, str):
+ pieces[i] = v.decode('utf-8')
+
output = ''.join(pieces)
if stripWhitespace:
output = output.strip()
- if not expectingText: return output
+ if not expectingText:
+ return output
# decode base64 content
if base64 and self.contentparams.get('base64', 0):
try:
- output = base64.decodestring(output)
+ output = _base64decode(output)
except binascii.Error:
pass
except binascii.Incomplete:
pass
-
+ except TypeError:
+ # In Python 3, base64 takes and outputs bytes, not str
+ # This may not be the most correct way to accomplish this
+ output = _base64decode(output.encode('utf-8')).decode('utf-8')
+
# resolve relative URIs
if (element in self.can_be_relative_uri) and output:
output = self.resolveURI(output)
-
+
# decode entities within embedded markup
if not self.contentparams.get('base64', 0):
output = self.decodeEntities(element, output)
+ # some feed formats require consumers to guess
+ # whether the content is html or plain text
+ if not self.version.startswith('atom') and self.contentparams.get('type') == 'text/plain':
+ if self.lookslikehtml(output):
+ self.contentparams['type'] = 'text/html'
+
# remove temporary cruft from contentparams
try:
del self.contentparams['mode']
@@ -635,26 +914,55 @@ class _FeedParserMixin:
except KeyError:
pass
+ is_htmlish = self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types
# resolve relative URIs within embedded markup
- if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types:
+ if is_htmlish and RESOLVE_RELATIVE_URIS:
if element in self.can_contain_relative_uris:
- output = _resolveRelativeURIs(output, self.baseuri, self.encoding)
-
- # sanitize embedded markup
- if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types:
- if element in self.can_contain_dangerous_markup:
- output = _sanitizeHTML(output, self.encoding)
+ output = _resolveRelativeURIs(output, self.baseuri, self.encoding, self.contentparams.get('type', 'text/html'))
- if self.encoding and type(output) != type(u''):
+ # parse microformats
+ # (must do this before sanitizing because some microformats
+ # rely on elements that we sanitize)
+ if PARSE_MICROFORMATS and is_htmlish and element in ['content', 'description', 'summary']:
+ mfresults = _parseMicroformats(output, self.baseuri, self.encoding)
+ if mfresults:
+ for tag in mfresults.get('tags', []):
+ self._addTag(tag['term'], tag['scheme'], tag['label'])
+ for enclosure in mfresults.get('enclosures', []):
+ self._start_enclosure(enclosure)
+ for xfn in mfresults.get('xfn', []):
+ self._addXFN(xfn['relationships'], xfn['href'], xfn['name'])
+ vcard = mfresults.get('vcard')
+ if vcard:
+ self._getContext()['vcard'] = vcard
+
+ # sanitize embedded markup
+ if is_htmlish and SANITIZE_HTML:
+ if element in self.can_contain_dangerous_markup:
+ output = _sanitizeHTML(output, self.encoding, self.contentparams.get('type', 'text/html'))
+
+ if self.encoding and not isinstance(output, str):
+ output = output.decode(self.encoding, 'ignore')
+
+ # address common error where people take data that is already
+ # utf-8, presume that it is iso-8859-1, and re-encode it.
+ if self.encoding in ('utf-8', 'utf-8_INVALID_PYTHON_3') and isinstance(output, str):
try:
- output = unicode(output, self.encoding)
- except:
+ output = output.encode('iso-8859-1').decode('utf-8')
+ except (UnicodeEncodeError, UnicodeDecodeError):
pass
+ # map win-1252 extensions to the proper code points
+ if isinstance(output, str):
+ output = output.translate(_cp1252)
+
# categories/tags/keywords/whatever are handled in _end_category
if element == 'category':
return output
-
+
+ if element == 'title' and -1 < self.title_depth <= self.depth:
+ return output
+
# store output in appropriate place(s)
if self.inentry and not self.insource:
if element == 'content':
@@ -663,23 +971,34 @@ class _FeedParserMixin:
contentparams['value'] = output
self.entries[-1][element].append(contentparams)
elif element == 'link':
- self.entries[-1][element] = output
- if output:
- self.entries[-1]['links'][-1]['href'] = output
+ if not self.inimage:
+ # query variables in urls in link elements are improperly
+ # converted from `?a=1&b=2` to `?a=1&b;=2` as if they're
+ # unhandled character references. fix this special case.
+ output = re.sub("&([A-Za-z0-9_]+);", "&\g<1>", output)
+ self.entries[-1][element] = output
+ if output:
+ self.entries[-1]['links'][-1]['href'] = output
else:
if element == 'description':
element = 'summary'
- self.entries[-1][element] = output
+ old_value_depth = self.property_depth_map.setdefault(self.entries[-1], {}).get(element)
+ if old_value_depth is None or self.depth <= old_value_depth:
+ self.property_depth_map[self.entries[-1]][element] = self.depth
+ self.entries[-1][element] = output
if self.incontent:
contentparams = copy.deepcopy(self.contentparams)
contentparams['value'] = output
self.entries[-1][element + '_detail'] = contentparams
- elif (self.infeed or self.insource) and (not self.intextinput) and (not self.inimage):
+ elif (self.infeed or self.insource):# and (not self.intextinput) and (not self.inimage):
context = self._getContext()
if element == 'description':
element = 'subtitle'
context[element] = output
if element == 'link':
+ # fix query variables; see above for the explanation
+ output = re.sub("&([A-Za-z0-9_]+);", "&\g<1>", output)
+ context[element] = output
context['links'][-1]['href'] = output
elif self.incontent:
contentparams = copy.deepcopy(self.contentparams)
@@ -689,6 +1008,8 @@ class _FeedParserMixin:
def pushContent(self, tag, attrsD, defaultContentType, expectingText):
self.incontent += 1
+ if self.lang:
+ self.lang=self.lang.replace('_','-')
self.contentparams = FeedParserDict({
'type': self.mapContentType(attrsD.get('type', defaultContentType)),
'language': self.lang,
@@ -701,16 +1022,36 @@ class _FeedParserMixin:
self.incontent -= 1
self.contentparams.clear()
return value
-
+
+ # a number of elements in a number of RSS variants are nominally plain
+ # text, but this is routinely ignored. This is an attempt to detect
+ # the most common cases. As false positives often result in silent
+ # data loss, this function errs on the conservative side.
+ @staticmethod
+ def lookslikehtml(s):
+ # must have a close tag or an entity reference to qualify
+ if not (re.search(r'</(\w+)>',s) or re.search("&#?\w+;",s)):
+ return
+
+ # all tags must be in a restricted subset of valid HTML tags
+ if [t for t in re.findall(r'</?(\w+)',s) if t.lower() not in _HTMLSanitizer.acceptable_elements]:
+ return
+
+ # all entities must have been defined as valid HTML entities
+ if [e for e in re.findall(r'&(\w+);', s) if e not in list(entitydefs.keys())]:
+ return
+
+ return 1
+
def _mapToStandardPrefix(self, name):
colonpos = name.find(':')
- if colonpos <> -1:
+ if colonpos != -1:
prefix = name[:colonpos]
suffix = name[colonpos+1:]
prefix = self.namespacemap.get(prefix, prefix)
name = prefix + ':' + suffix
return name
-
+
def _getAttribute(self, attrsD, name):
return attrsD.get(self._mapToStandardPrefix(name))
@@ -738,17 +1079,23 @@ class _FeedParserMixin:
pass
attrsD['href'] = href
return attrsD
-
- def _save(self, key, value):
+
+ def _save(self, key, value, overwrite=False):
context = self._getContext()
- context.setdefault(key, value)
+ if overwrite:
+ context[key] = value
+ else:
+ context.setdefault(key, value)
def _start_rss(self, attrsD):
versionmap = {'0.91': 'rss091u',
'0.92': 'rss092',
'0.93': 'rss093',
'0.94': 'rss094'}
- if not self.version:
+ #If we're here then this is an RSS feed.
+ #If we don't have a version or have a version that starts with something
+ #other than RSS then there's been a mistake. Correct it.
+ if not self.version or not self.version.startswith('rss'):
attr_version = attrsD.get('version', '')
version = versionmap.get(attr_version)
if version:
@@ -757,25 +1104,21 @@ class _FeedParserMixin:
self.version = 'rss20'
else:
self.version = 'rss'
-
- def _start_dlhottitles(self, attrsD):
- self.version = 'hotrss'
def _start_channel(self, attrsD):
self.infeed = 1
self._cdf_common(attrsD)
- _start_feedinfo = _start_channel
def _cdf_common(self, attrsD):
- if attrsD.has_key('lastmod'):
+ if 'lastmod' in attrsD:
self._start_modified({})
self.elementstack[-1][-1] = attrsD['lastmod']
self._end_modified()
- if attrsD.has_key('href'):
+ if 'href' in attrsD:
self._start_link({})
self.elementstack[-1][-1] = attrsD['href']
self._end_link()
-
+
def _start_feed(self, attrsD):
self.infeed = 1
versionmap = {'0.1': 'atom01',
@@ -792,24 +1135,27 @@ class _FeedParserMixin:
def _end_channel(self):
self.infeed = 0
_end_feed = _end_channel
-
+
def _start_image(self, attrsD):
- self.inimage = 1
- self.push('image', 0)
context = self._getContext()
- context.setdefault('image', FeedParserDict())
-
+ if not self.inentry:
+ context.setdefault('image', FeedParserDict())
+ self.inimage = 1
+ self.title_depth = -1
+ self.push('image', 0)
+
def _end_image(self):
self.pop('image')
self.inimage = 0
def _start_textinput(self, attrsD):
- self.intextinput = 1
- self.push('textinput', 0)
context = self._getContext()
context.setdefault('textinput', FeedParserDict())
+ self.intextinput = 1
+ self.title_depth = -1
+ self.push('textinput', 0)
_start_textInput = _start_textinput
-
+
def _end_textinput(self):
self.pop('textinput')
self.intextinput = 0
@@ -818,6 +1164,10 @@ class _FeedParserMixin:
def _start_author(self, attrsD):
self.inauthor = 1
self.push('author', 1)
+ # Append a new FeedParserDict when expecting an author
+ context = self._getContext()
+ context.setdefault('authors', [])
+ context['authors'].append(FeedParserDict())
_start_managingeditor = _start_author
_start_dc_author = _start_author
_start_dc_creator = _start_author
@@ -877,7 +1227,7 @@ class _FeedParserMixin:
self._save_contributor('name', value)
elif self.intextinput:
context = self._getContext()
- context['textinput']['name'] = value
+ context['name'] = value
_end_itunes_name = _end_name
def _start_width(self, attrsD):
@@ -887,11 +1237,11 @@ class _FeedParserMixin:
value = self.pop('width')
try:
value = int(value)
- except:
+ except ValueError:
value = 0
if self.inimage:
context = self._getContext()
- context['image']['width'] = value
+ context['width'] = value
def _start_height(self, attrsD):
self.push('height', 0)
@@ -900,11 +1250,11 @@ class _FeedParserMixin:
value = self.pop('height')
try:
value = int(value)
- except:
+ except ValueError:
value = 0
if self.inimage:
context = self._getContext()
- context['image']['height'] = value
+ context['height'] = value
def _start_url(self, attrsD):
self.push('href', 1)
@@ -917,12 +1267,6 @@ class _FeedParserMixin:
self._save_author('href', value)
elif self.incontributor:
self._save_contributor('href', value)
- elif self.inimage:
- context = self._getContext()
- context['image']['href'] = value
- elif self.intextinput:
- context = self._getContext()
- context['textinput']['link'] = value
_end_homepage = _end_url
_end_uri = _end_url
@@ -943,6 +1287,10 @@ class _FeedParserMixin:
def _getContext(self):
if self.insource:
context = self.sourcedata
+ elif self.inimage and 'image' in self.feeddata:
+ context = self.feeddata['image']
+ elif self.intextinput:
+ context = self.feeddata['textinput']
elif self.inentry:
context = self.entries[-1]
else:
@@ -954,6 +1302,8 @@ class _FeedParserMixin:
context.setdefault(prefix + '_detail', FeedParserDict())
context[prefix + '_detail'][key] = value
self._sync_author_detail()
+ context.setdefault('authors', [FeedParserDict()])
+ context['authors'][-1][key] = value
def _save_contributor(self, key, value):
context = self._getContext()
@@ -973,23 +1323,29 @@ class _FeedParserMixin:
elif email:
context[key] = email
else:
- author = context.get(key)
- if not author: return
- emailmatch = re.search(r'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))''', author)
- if not emailmatch: return
- email = emailmatch.group(0)
- # probably a better way to do the following, but it passes all the tests
- author = author.replace(email, '')
- author = author.replace('()', '')
- author = author.strip()
- if author and (author[0] == '('):
- author = author[1:]
- if author and (author[-1] == ')'):
- author = author[:-1]
- author = author.strip()
- context.setdefault('%s_detail' % key, FeedParserDict())
- context['%s_detail' % key]['name'] = author
- context['%s_detail' % key]['email'] = email
+ author, email = context.get(key), None
+ if not author:
+ return
+ emailmatch = re.search(r'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))(\?subject=\S+)?''', author)
+ if emailmatch:
+ email = emailmatch.group(0)
+ # probably a better way to do the following, but it passes all the tests
+ author = author.replace(email, '')
+ author = author.replace('()', '')
+ author = author.replace('<>', '')
+ author = author.replace('&lt;&gt;', '')
+ author = author.strip()
+ if author and (author[0] == '('):
+ author = author[1:]
+ if author and (author[-1] == ')'):
+ author = author[:-1]
+ author = author.strip()
+ if author or email:
+ context.setdefault('%s_detail' % key, FeedParserDict())
+ if author:
+ context['%s_detail' % key]['name'] = author
+ if email:
+ context['%s_detail' % key]['email'] = email
def _start_subtitle(self, attrsD):
self.pushContent('subtitle', attrsD, 'text/plain', 1)
@@ -1000,7 +1356,7 @@ class _FeedParserMixin:
self.popContent('subtitle')
_end_tagline = _end_subtitle
_end_itunes_subtitle = _end_subtitle
-
+
def _start_rights(self, attrsD):
self.pushContent('rights', attrsD, 'text/plain', 1)
_start_dc_rights = _start_rights
@@ -1016,13 +1372,13 @@ class _FeedParserMixin:
self.push('item', 0)
self.inentry = 1
self.guidislink = 0
+ self.title_depth = -1
id = self._getAttribute(attrsD, 'rdf:about')
if id:
context = self._getContext()
context['id'] = id
self._cdf_common(attrsD)
_start_entry = _start_item
- _start_product = _start_item
def _end_item(self):
self.pop('item')
@@ -1050,28 +1406,30 @@ class _FeedParserMixin:
self.push('published', 1)
_start_dcterms_issued = _start_published
_start_issued = _start_published
+ _start_pubdate = _start_published
def _end_published(self):
value = self.pop('published')
- self._save('published_parsed', _parse_date(value))
+ self._save('published_parsed', _parse_date(value), overwrite=True)
_end_dcterms_issued = _end_published
_end_issued = _end_published
+ _end_pubdate = _end_published
def _start_updated(self, attrsD):
self.push('updated', 1)
_start_modified = _start_updated
_start_dcterms_modified = _start_updated
- _start_pubdate = _start_updated
_start_dc_date = _start_updated
+ _start_lastbuilddate = _start_updated
def _end_updated(self):
value = self.pop('updated')
parsed_value = _parse_date(value)
- self._save('updated_parsed', parsed_value)
+ self._save('updated_parsed', parsed_value, overwrite=True)
_end_modified = _end_updated
_end_dcterms_modified = _end_updated
- _end_pubdate = _end_updated
_end_dc_date = _end_updated
+ _end_lastbuilddate = _end_updated
def _start_created(self, attrsD):
self.push('created', 1)
@@ -1079,38 +1437,56 @@ class _FeedParserMixin:
def _end_created(self):
value = self.pop('created')
- self._save('created_parsed', _parse_date(value))
+ self._save('created_parsed', _parse_date(value), overwrite=True)
_end_dcterms_created = _end_created
def _start_expirationdate(self, attrsD):
self.push('expired', 1)
def _end_expirationdate(self):
- self._save('expired_parsed', _parse_date(self.pop('expired')))
+ self._save('expired_parsed', _parse_date(self.pop('expired')), overwrite=True)
def _start_cc_license(self, attrsD):
- self.push('license', 1)
+ context = self._getContext()
value = self._getAttribute(attrsD, 'rdf:resource')
+ attrsD = FeedParserDict()
+ attrsD['rel'] = 'license'
if value:
- self.elementstack[-1][2].append(value)
- self.pop('license')
-
+ attrsD['href']=value
+ context.setdefault('links', []).append(attrsD)
+
def _start_creativecommons_license(self, attrsD):
self.push('license', 1)
+ _start_creativeCommons_license = _start_creativecommons_license
def _end_creativecommons_license(self):
- self.pop('license')
+ value = self.pop('license')
+ context = self._getContext()
+ attrsD = FeedParserDict()
+ attrsD['rel'] = 'license'
+ if value:
+ attrsD['href'] = value
+ context.setdefault('links', []).append(attrsD)
+ del context['license']
+ _end_creativeCommons_license = _end_creativecommons_license
+
+ def _addXFN(self, relationships, href, name):
+ context = self._getContext()
+ xfn = context.setdefault('xfn', [])
+ value = FeedParserDict({'relationships': relationships, 'href': href, 'name': name})
+ if value not in xfn:
+ xfn.append(value)
def _addTag(self, term, scheme, label):
context = self._getContext()
tags = context.setdefault('tags', [])
- if (not term) and (not scheme) and (not label): return
+ if (not term) and (not scheme) and (not label):
+ return
value = FeedParserDict({'term': term, 'scheme': scheme, 'label': label})
if value not in tags:
- tags.append(FeedParserDict({'term': term, 'scheme': scheme, 'label': label}))
+ tags.append(value)
def _start_category(self, attrsD):
- if _debug: sys.stderr.write('entering _start_category with %s\n' % repr(attrsD))
term = attrsD.get('term')
scheme = attrsD.get('scheme', attrsD.get('domain'))
label = attrsD.get('label')
@@ -1118,18 +1494,24 @@ class _FeedParserMixin:
self.push('category', 1)
_start_dc_subject = _start_category
_start_keywords = _start_category
-
+
+ def _start_media_category(self, attrsD):
+ attrsD.setdefault('scheme', 'http://search.yahoo.com/mrss/category_schema')
+ self._start_category(attrsD)
+
def _end_itunes_keywords(self):
- for term in self.pop('itunes_keywords').split():
- self._addTag(term, 'http://www.itunes.com/', None)
-
+ for term in self.pop('itunes_keywords').split(','):
+ if term.strip():
+ self._addTag(term.strip(), 'http://www.itunes.com/', None)
+
def _start_itunes_category(self, attrsD):
self._addTag(attrsD.get('text'), 'http://www.itunes.com/', None)
self.push('category', 1)
-
+
def _end_category(self):
value = self.pop('category')
- if not value: return
+ if not value:
+ return
context = self._getContext()
tags = context['tags']
if value and len(tags) and not tags[-1]['term']:
@@ -1139,73 +1521,78 @@ class _FeedParserMixin:
_end_dc_subject = _end_category
_end_keywords = _end_category
_end_itunes_category = _end_category
+ _end_media_category = _end_category
def _start_cloud(self, attrsD):
self._getContext()['cloud'] = FeedParserDict(attrsD)
-
+
def _start_link(self, attrsD):
attrsD.setdefault('rel', 'alternate')
- attrsD.setdefault('type', 'text/html')
+ if attrsD['rel'] == 'self':
+ attrsD.setdefault('type', 'application/atom+xml')
+ else:
+ attrsD.setdefault('type', 'text/html')
+ context = self._getContext()
attrsD = self._itsAnHrefDamnIt(attrsD)
- if attrsD.has_key('href'):
+ if 'href' in attrsD:
attrsD['href'] = self.resolveURI(attrsD['href'])
expectingText = self.infeed or self.inentry or self.insource
- context = self._getContext()
context.setdefault('links', [])
- context['links'].append(FeedParserDict(attrsD))
- if attrsD['rel'] == 'enclosure':
- self._start_enclosure(attrsD)
- if attrsD.has_key('href'):
+ if not (self.inentry and self.inimage):
+ context['links'].append(FeedParserDict(attrsD))
+ if 'href' in attrsD:
expectingText = 0
if (attrsD.get('rel') == 'alternate') and (self.mapContentType(attrsD.get('type')) in self.html_types):
context['link'] = attrsD['href']
else:
self.push('link', expectingText)
- _start_producturl = _start_link
def _end_link(self):
value = self.pop('link')
- context = self._getContext()
- if self.intextinput:
- context['textinput']['link'] = value
- if self.inimage:
- context['image']['link'] = value
- _end_producturl = _end_link
def _start_guid(self, attrsD):
self.guidislink = (attrsD.get('ispermalink', 'true') == 'true')
self.push('id', 1)
+ _start_id = _start_guid
def _end_guid(self):
value = self.pop('id')
- self._save('guidislink', self.guidislink and not self._getContext().has_key('link'))
+ self._save('guidislink', self.guidislink and 'link' not in self._getContext())
if self.guidislink:
# guid acts as link, but only if 'ispermalink' is not present or is 'true',
# and only if the item doesn't already have a link element
self._save('link', value)
+ _end_id = _end_guid
def _start_title(self, attrsD):
+ if self.svgOK:
+ return self.unknown_starttag('title', list(attrsD.items()))
self.pushContent('title', attrsD, 'text/plain', self.infeed or self.inentry or self.insource)
_start_dc_title = _start_title
_start_media_title = _start_title
def _end_title(self):
+ if self.svgOK:
+ return
value = self.popContent('title')
- context = self._getContext()
- if self.intextinput:
- context['textinput']['title'] = value
- elif self.inimage:
- context['image']['title'] = value
+ if not value:
+ return
+ self.title_depth = self.depth
_end_dc_title = _end_title
- _end_media_title = _end_title
+
+ def _end_media_title(self):
+ title_depth = self.title_depth
+ self._end_title()
+ self.title_depth = title_depth
def _start_description(self, attrsD):
context = self._getContext()
- if context.has_key('summary'):
+ if 'summary' in context:
self._summaryKey = 'content'
self._start_content(attrsD)
else:
self.pushContent('description', attrsD, 'text/html', self.infeed or self.inentry or self.insource)
+ _start_dc_description = _start_description
def _start_abstract(self, attrsD):
self.pushContent('description', attrsD, 'text/plain', self.infeed or self.inentry or self.insource)
@@ -1215,13 +1602,9 @@ class _FeedParserMixin:
self._end_content()
else:
value = self.popContent('description')
- context = self._getContext()
- if self.intextinput:
- context['textinput']['description'] = value
- elif self.inimage:
- context['image']['description'] = value
self._summaryKey = None
_end_abstract = _end_description
+ _end_dc_description = _end_description
def _start_info(self, attrsD):
self.pushContent('info', attrsD, 'text/plain', 1)
@@ -1234,7 +1617,7 @@ class _FeedParserMixin:
def _start_generator(self, attrsD):
if attrsD:
attrsD = self._itsAnHrefDamnIt(attrsD)
- if attrsD.has_key('href'):
+ if 'href' in attrsD:
attrsD['href'] = self.resolveURI(attrsD['href'])
self._getContext()['generator_detail'] = FeedParserDict(attrsD)
self.push('generator', 1)
@@ -1242,9 +1625,9 @@ class _FeedParserMixin:
def _end_generator(self):
value = self.pop('generator')
context = self._getContext()
- if context.has_key('generator_detail'):
+ if 'generator_detail' in context:
context['generator_detail']['name'] = value
-
+
def _start_admin_generatoragent(self, attrsD):
self.push('generator', 1)
value = self._getAttribute(attrsD, 'rdf:resource')
@@ -1259,10 +1642,10 @@ class _FeedParserMixin:
if value:
self.elementstack[-1][2].append(value)
self.pop('errorreportsto')
-
+
def _start_summary(self, attrsD):
context = self._getContext()
- if context.has_key('summary'):
+ if 'summary' in context:
self._summaryKey = 'content'
self._start_content(attrsD)
else:
@@ -1277,21 +1660,26 @@ class _FeedParserMixin:
self.popContent(self._summaryKey or 'summary')
self._summaryKey = None
_end_itunes_summary = _end_summary
-
+
def _start_enclosure(self, attrsD):
attrsD = self._itsAnHrefDamnIt(attrsD)
- self._getContext().setdefault('enclosures', []).append(FeedParserDict(attrsD))
- href = attrsD.get('href')
- if href:
- context = self._getContext()
- if not context.get('id'):
- context['id'] = href
-
+ context = self._getContext()
+ attrsD['rel'] = 'enclosure'
+ context.setdefault('links', []).append(FeedParserDict(attrsD))
+
def _start_source(self, attrsD):
+ if 'url' in attrsD:
+ # This means that we're processing a source element from an RSS 2.0 feed
+ self.sourcedata['href'] = attrsD['url']
+ self.push('source', 1)
self.insource = 1
+ self.title_depth = -1
def _end_source(self):
self.insource = 0
+ value = self.pop('source')
+ if value:
+ self.sourcedata['title'] = value
self._getContext()['source'] = copy.deepcopy(self.sourcedata)
self.sourcedata.clear()
@@ -1302,9 +1690,6 @@ class _FeedParserMixin:
self.contentparams['src'] = src
self.push('content', 1)
- def _start_prodlink(self, attrsD):
- self.pushContent('content', attrsD, 'text/html', 1)
-
def _start_body(self, attrsD):
self.pushContent('content', attrsD, 'application/xhtml+xml', 1)
_start_xhtml_body = _start_body
@@ -1314,45 +1699,95 @@ class _FeedParserMixin:
_start_fullitem = _start_content_encoded
def _end_content(self):
- copyToDescription = self.mapContentType(self.contentparams.get('type')) in (['text/plain'] + self.html_types)
+ copyToSummary = self.mapContentType(self.contentparams.get('type')) in (['text/plain'] + self.html_types)
value = self.popContent('content')
- if copyToDescription:
- self._save('description', value)
+ if copyToSummary:
+ self._save('summary', value)
+
_end_body = _end_content
_end_xhtml_body = _end_content
_end_content_encoded = _end_content
_end_fullitem = _end_content
- _end_prodlink = _end_content
def _start_itunes_image(self, attrsD):
self.push('itunes_image', 0)
- self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')})
+ if attrsD.get('href'):
+ self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')})
+ elif attrsD.get('url'):
+ self._getContext()['image'] = FeedParserDict({'href': attrsD.get('url')})
_start_itunes_link = _start_itunes_image
-
+
def _end_itunes_block(self):
value = self.pop('itunes_block', 0)
self._getContext()['itunes_block'] = (value == 'yes') and 1 or 0
def _end_itunes_explicit(self):
value = self.pop('itunes_explicit', 0)
- self._getContext()['itunes_explicit'] = (value == 'yes') and 1 or 0
+ # Convert 'yes' -> True, 'clean' to False, and any other value to None
+ # False and None both evaluate as False, so the difference can be ignored
+ # by applications that only need to know if the content is explicit.
+ self._getContext()['itunes_explicit'] = (None, False, True)[(value == 'yes' and 2) or value == 'clean' or 0]
+
+ def _start_media_content(self, attrsD):
+ context = self._getContext()
+ context.setdefault('media_content', [])
+ context['media_content'].append(attrsD)
+
+ def _start_media_thumbnail(self, attrsD):
+ context = self._getContext()
+ context.setdefault('media_thumbnail', [])
+ self.push('url', 1) # new
+ context['media_thumbnail'].append(attrsD)
+
+ def _end_media_thumbnail(self):
+ url = self.pop('url')
+ context = self._getContext()
+ if url != None and len(url.strip()) != 0:
+ if 'url' not in context['media_thumbnail'][-1]:
+ context['media_thumbnail'][-1]['url'] = url
+
+ def _start_media_player(self, attrsD):
+ self.push('media_player', 0)
+ self._getContext()['media_player'] = FeedParserDict(attrsD)
+
+ def _end_media_player(self):
+ value = self.pop('media_player')
+ context = self._getContext()
+ context['media_player']['content'] = value
+
+ def _start_newlocation(self, attrsD):
+ self.push('newlocation', 1)
+
+ def _end_newlocation(self):
+ url = self.pop('newlocation')
+ context = self._getContext()
+ # don't set newlocation if the context isn't right
+ if context is not self.feeddata:
+ return
+ context['newlocation'] = _makeSafeAbsoluteURI(self.baseuri, url.strip())
if _XML_AVAILABLE:
class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler):
def __init__(self, baseuri, baselang, encoding):
- if _debug: sys.stderr.write('trying StrictFeedParser\n')
xml.sax.handler.ContentHandler.__init__(self)
_FeedParserMixin.__init__(self, baseuri, baselang, encoding)
self.bozo = 0
self.exc = None
-
+ self.decls = {}
+
def startPrefixMapping(self, prefix, uri):
+ if not uri:
+ return
+ # Jython uses '' instead of None; standardize on None
+ prefix = prefix or None
self.trackNamespace(prefix, uri)
-
+ if prefix and uri == 'http://www.w3.org/1999/xlink':
+ self.decls['xmlns:' + prefix] = uri
+
def startElementNS(self, name, qname, attrs):
namespace, localname = name
lowernamespace = str(namespace or '').lower()
- if lowernamespace.find('backend.userland.com/rss') <> -1:
+ if lowernamespace.find('backend.userland.com/rss') != -1:
# match any backend.userland.com namespace
namespace = 'http://backend.userland.com/rss'
lowernamespace = namespace
@@ -1361,12 +1796,9 @@ if _XML_AVAILABLE:
else:
givenprefix = None
prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
- if givenprefix and (prefix == None or (prefix == '' and lowernamespace == '')) and not self.namespacesInUse.has_key(givenprefix):
- raise UndeclaredNamespace, "'%s' is not associated with a namespace" % givenprefix
- if prefix:
- localname = prefix + ':' + localname
+ if givenprefix and (prefix == None or (prefix == '' and lowernamespace == '')) and givenprefix not in self.namespacesInUse:
+ raise UndeclaredNamespace("'%s' is not associated with a namespace" % givenprefix)
localname = str(localname).lower()
- if _debug: sys.stderr.write('startElementNS: qname = %s, namespace = %s, givenprefix = %s, prefix = %s, attrs = %s, localname = %s\n' % (qname, namespace, givenprefix, prefix, attrs.items(), localname))
# qname implementation is horribly broken in Python 2.1 (it
# doesn't report any), and slightly broken in Python 2.2 (it
@@ -1375,8 +1807,21 @@ if _XML_AVAILABLE:
# the qnames the SAX parser gives us (if indeed it gives us any
# at all). Thanks to MatejC for helping me test this and
# tirelessly telling me that it didn't work yet.
- attrsD = {}
- for (namespace, attrlocalname), attrvalue in attrs._attrs.items():
+ attrsD, self.decls = self.decls, {}
+ if localname=='math' and namespace=='http://www.w3.org/1998/Math/MathML':
+ attrsD['xmlns']=namespace
+ if localname=='svg' and namespace=='http://www.w3.org/2000/svg':
+ attrsD['xmlns']=namespace
+
+ if prefix:
+ localname = prefix.lower() + ':' + localname
+ elif namespace and not qname: #Expat
+ for name,value in list(self.namespacesInUse.items()):
+ if name and value == namespace:
+ localname = name + ':' + localname
+ break
+
+ for (namespace, attrlocalname), attrvalue in list(attrs.items()):
lowernamespace = (namespace or '').lower()
prefix = self._matchnamespaces.get(lowernamespace, '')
if prefix:
@@ -1384,7 +1829,7 @@ if _XML_AVAILABLE:
attrsD[str(attrlocalname).lower()] = attrvalue
for qname in attrs.getQNames():
attrsD[str(qname).lower()] = attrs.getValueByQName(qname)
- self.unknown_starttag(localname, attrsD.items())
+ self.unknown_starttag(localname, list(attrsD.items()))
def characters(self, text):
self.handle_data(text)
@@ -1399,26 +1844,39 @@ if _XML_AVAILABLE:
prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
if prefix:
localname = prefix + ':' + localname
+ elif namespace and not qname: #Expat
+ for name,value in list(self.namespacesInUse.items()):
+ if name and value == namespace:
+ localname = name + ':' + localname
+ break
localname = str(localname).lower()
self.unknown_endtag(localname)
def error(self, exc):
self.bozo = 1
self.exc = exc
-
+
+ # drv_libxml2 calls warning() in some cases
+ warning = error
+
def fatalError(self, exc):
self.error(exc)
raise exc
class _BaseHTMLProcessor(sgmllib.SGMLParser):
- elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
- 'img', 'input', 'isindex', 'link', 'meta', 'param']
-
- def __init__(self, encoding):
+ special = re.compile('''[<>'"]''')
+ bare_ampersand = re.compile("&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)")
+ elements_no_end_tag = set([
+ 'area', 'base', 'basefont', 'br', 'col', 'command', 'embed', 'frame',
+ 'hr', 'img', 'input', 'isindex', 'keygen', 'link', 'meta', 'param',
+ 'source', 'track', 'wbr'
+ ])
+
+ def __init__(self, encoding, _type):
self.encoding = encoding
- if _debug: sys.stderr.write('entering BaseHTMLProcessor, encoding=%s\n' % self.encoding)
+ self._type = _type
sgmllib.SGMLParser.__init__(self)
-
+
def reset(self):
self.pieces = []
sgmllib.SGMLParser.reset(self)
@@ -1429,80 +1887,132 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser):
return '<' + tag + ' />'
else:
return '<' + tag + '></' + tag + '>'
-
+
+ # By declaring these methods and overriding their compiled code
+ # with the code from sgmllib, the original code will execute in
+ # feedparser's scope instead of sgmllib's. This means that the
+ # `tagfind` and `charref` regular expressions will be found as
+ # they're declared above, not as they're declared in sgmllib.
+ def goahead(self, i):
+ pass
+ goahead.__code__ = sgmllib.SGMLParser.goahead.__code__
+
+ def __parse_starttag(self, i):
+ pass
+ __parse_starttag.__code__ = sgmllib.SGMLParser.parse_starttag.__code__
+
+ def parse_starttag(self,i):
+ j = self.__parse_starttag(i)
+ if self._type == 'application/xhtml+xml':
+ if j>2 and self.rawdata[j-2:j]=='/>':
+ self.unknown_endtag(self.lasttag)
+ return j
+
def feed(self, data):
data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'&lt;!\1', data)
#data = re.sub(r'<(\S+?)\s*?/>', self._shorttag_replace, data) # bug [ 1399464 ] Bad regexp for _shorttag_replace
- data = re.sub(r'<([^<\s]+?)\s*/>', self._shorttag_replace, data)
+ data = re.sub(r'<([^<>\s]+?)\s*/>', self._shorttag_replace, data)
data = data.replace('&#39;', "'")
data = data.replace('&#34;', '"')
- if self.encoding and type(data) == type(u''):
- data = data.encode(self.encoding)
+ try:
+ bytes
+ if bytes is str:
+ raise NameError
+ self.encoding = self.encoding + '_INVALID_PYTHON_3'
+ except NameError:
+ if self.encoding and isinstance(data, str):
+ data = data.encode(self.encoding)
sgmllib.SGMLParser.feed(self, data)
+ sgmllib.SGMLParser.close(self)
def normalize_attrs(self, attrs):
+ if not attrs:
+ return attrs
# utility method to be called by descendants
- attrs = [(k.lower(), v) for k, v in attrs]
+ attrs = list(dict([(k.lower(), v) for k, v in attrs]).items())
attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
+ attrs.sort()
return attrs
def unknown_starttag(self, tag, attrs):
# called for each start tag
# attrs is a list of (attr, value) tuples
# e.g. for <pre class="screen">, tag='pre', attrs=[('class', 'screen')]
- if _debug: sys.stderr.write('_BaseHTMLProcessor, unknown_starttag, tag=%s\n' % tag)
uattrs = []
- # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds
- for key, value in attrs:
- if type(value) != type(u''):
- value = unicode(value, self.encoding)
- uattrs.append((unicode(key, self.encoding), value))
- strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs]).encode(self.encoding)
+ strattrs=''
+ if attrs:
+ for key, value in attrs:
+ value=value.replace('&gt;','>').replace('&lt;','<').replace('&quot;','"')
+ value = self.bare_ampersand.sub("&amp;", value)
+ # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds
+ if not isinstance(value, str):
+ value = value.decode(self.encoding, 'ignore')
+ try:
+ # Currently, in Python 3 the key is already a str, and cannot be decoded again
+ uattrs.append((str(key, self.encoding), value))
+ except TypeError:
+ uattrs.append((key, value))
+ strattrs = ''.join([' %s="%s"' % (key, value) for key, value in uattrs])
+ if self.encoding:
+ try:
+ strattrs = strattrs.encode(self.encoding)
+ except (UnicodeEncodeError, LookupError):
+ pass
if tag in self.elements_no_end_tag:
- self.pieces.append('<%(tag)s%(strattrs)s />' % locals())
+ self.pieces.append('<%s%s />' % (tag, strattrs))
else:
- self.pieces.append('<%(tag)s%(strattrs)s>' % locals())
+ self.pieces.append('<%s%s>' % (tag, strattrs))
def unknown_endtag(self, tag):
# called for each end tag, e.g. for </pre>, tag will be 'pre'
# Reconstruct the original end tag.
if tag not in self.elements_no_end_tag:
- self.pieces.append("</%(tag)s>" % locals())
+ self.pieces.append("</%s>" % tag)
def handle_charref(self, ref):
# called for each character reference, e.g. for '&#160;', ref will be '160'
# Reconstruct the original character reference.
- self.pieces.append('&#%(ref)s;' % locals())
-
+ ref = ref.lower()
+ if ref.startswith('x'):
+ value = int(ref[1:], 16)
+ else:
+ value = int(ref)
+
+ if value in _cp1252:
+ self.pieces.append('&#%s;' % hex(ord(_cp1252[value]))[1:])
+ else:
+ self.pieces.append('&#%s;' % ref)
+
def handle_entityref(self, ref):
# called for each entity reference, e.g. for '&copy;', ref will be 'copy'
# Reconstruct the original entity reference.
- self.pieces.append('&%(ref)s;' % locals())
+ if ref in name2codepoint or ref == 'apos':
+ self.pieces.append('&%s;' % ref)
+ else:
+ self.pieces.append('&amp;%s' % ref)
def handle_data(self, text):
# called for each block of plain text, i.e. outside of any tag and
# not containing any character or entity references
# Store the original text verbatim.
- if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_text, text=%s\n' % text)
self.pieces.append(text)
-
+
def handle_comment(self, text):
# called for each HTML comment, e.g. <!--insert message here-->
# Reconstruct the original comment.
- self.pieces.append('<!--%(text)s-->' % locals())
-
+ self.pieces.append('<!--%s-->' % text)
+
def handle_pi(self, text):
# called for each processing instruction, e.g. <?instruction>
# Reconstruct original processing instruction.
- self.pieces.append('<?%(text)s>' % locals())
+ self.pieces.append('<?%s>' % text)
def handle_decl(self, text):
# called for the DOCTYPE, if present, e.g.
# <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
#     "http://www.w3.org/TR/html4/loose.dtd">
# Reconstruct original DOCTYPE
- self.pieces.append('<!%(text)s>' % locals())
-
+ self.pieces.append('<!%s>' % text)
+
_new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
def _scan_name(self, i, declstartpos):
rawdata = self.rawdata
@@ -1521,36 +2031,497 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser):
# self.updatepos(declstartpos, i)
return None, -1
+ def convert_charref(self, name):
+ return '&#%s;' % name
+
+ def convert_entityref(self, name):
+ return '&%s;' % name
+
def output(self):
'''Return processed HTML as a single string'''
return ''.join([str(p) for p in self.pieces])
+ def parse_declaration(self, i):
+ try:
+ return sgmllib.SGMLParser.parse_declaration(self, i)
+ except sgmllib.SGMLParseError:
+ # escape the doctype declaration and continue parsing
+ self.handle_data('<')
+ return i+1
+
class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor):
- def __init__(self, baseuri, baselang, encoding):
+ def __init__(self, baseuri, baselang, encoding, entities):
sgmllib.SGMLParser.__init__(self)
_FeedParserMixin.__init__(self, baseuri, baselang, encoding)
+ _BaseHTMLProcessor.__init__(self, encoding, 'application/xhtml+xml')
+ self.entities=entities
def decodeEntities(self, element, data):
data = data.replace('&#60;', '&lt;')
data = data.replace('&#x3c;', '&lt;')
+ data = data.replace('&#x3C;', '&lt;')
data = data.replace('&#62;', '&gt;')
data = data.replace('&#x3e;', '&gt;')
+ data = data.replace('&#x3E;', '&gt;')
data = data.replace('&#38;', '&amp;')
data = data.replace('&#x26;', '&amp;')
data = data.replace('&#34;', '&quot;')
data = data.replace('&#x22;', '&quot;')
data = data.replace('&#39;', '&apos;')
data = data.replace('&#x27;', '&apos;')
- if self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
+ if not self.contentparams.get('type', 'xml').endswith('xml'):
data = data.replace('&lt;', '<')
data = data.replace('&gt;', '>')
data = data.replace('&amp;', '&')
data = data.replace('&quot;', '"')
data = data.replace('&apos;', "'")
return data
-
+
+ def strattrs(self, attrs):
+ return ''.join([' %s="%s"' % (n,v.replace('"','&quot;')) for n,v in attrs])
+
+class _MicroformatsParser:
+ STRING = 1
+ DATE = 2
+ URI = 3
+ NODE = 4
+ EMAIL = 5
+
+ known_xfn_relationships = set(['contact', 'acquaintance', 'friend', 'met', 'co-worker', 'coworker', 'colleague', 'co-resident', 'coresident', 'neighbor', 'child', 'parent', 'sibling', 'brother', 'sister', 'spouse', 'wife', 'husband', 'kin', 'relative', 'muse', 'crush', 'date', 'sweetheart', 'me'])
+ known_binary_extensions = set(['zip','rar','exe','gz','tar','tgz','tbz2','bz2','z','7z','dmg','img','sit','sitx','hqx','deb','rpm','bz2','jar','rar','iso','bin','msi','mp2','mp3','ogg','ogm','mp4','m4v','m4a','avi','wma','wmv'])
+
+ def __init__(self, data, baseuri, encoding):
+ self.document = BeautifulSoup.BeautifulSoup(data)
+ self.baseuri = baseuri
+ self.encoding = encoding
+ if isinstance(data, str):
+ data = data.encode(encoding)
+ self.tags = []
+ self.enclosures = []
+ self.xfn = []
+ self.vcard = None
+
+ def vcardEscape(self, s):
+ if isinstance(s, str):
+ s = s.replace(',', '\\,').replace(';', '\\;').replace('\n', '\\n')
+ return s
+
+ def vcardFold(self, s):
+ s = re.sub(';+$', '', s)
+ sFolded = ''
+ iMax = 75
+ sPrefix = ''
+ while len(s) > iMax:
+ sFolded += sPrefix + s[:iMax] + '\n'
+ s = s[iMax:]
+ sPrefix = ' '
+ iMax = 74
+ sFolded += sPrefix + s
+ return sFolded
+
+ def normalize(self, s):
+ return re.sub(r'\s+', ' ', s).strip()
+
+ def unique(self, aList):
+ results = []
+ for element in aList:
+ if element not in results:
+ results.append(element)
+ return results
+
+ def toISO8601(self, dt):
+ return time.strftime('%Y-%m-%dT%H:%M:%SZ', dt)
+
+ def getPropertyValue(self, elmRoot, sProperty, iPropertyType=4, bAllowMultiple=0, bAutoEscape=0):
+ all = lambda x: 1
+ sProperty = sProperty.lower()
+ bFound = 0
+ bNormalize = 1
+ propertyMatch = {'class': re.compile(r'\b%s\b' % sProperty)}
+ if bAllowMultiple and (iPropertyType != self.NODE):
+ snapResults = []
+ containers = elmRoot(['ul', 'ol'], propertyMatch)
+ for container in containers:
+ snapResults.extend(container('li'))
+ bFound = (len(snapResults) != 0)
+ if not bFound:
+ snapResults = elmRoot(all, propertyMatch)
+ bFound = (len(snapResults) != 0)
+ if (not bFound) and (sProperty == 'value'):
+ snapResults = elmRoot('pre')
+ bFound = (len(snapResults) != 0)
+ bNormalize = not bFound
+ if not bFound:
+ snapResults = [elmRoot]
+ bFound = (len(snapResults) != 0)
+ arFilter = []
+ if sProperty == 'vcard':
+ snapFilter = elmRoot(all, propertyMatch)
+ for node in snapFilter:
+ if node.findParent(all, propertyMatch):
+ arFilter.append(node)
+ arResults = []
+ for node in snapResults:
+ if node not in arFilter:
+ arResults.append(node)
+ bFound = (len(arResults) != 0)
+ if not bFound:
+ if bAllowMultiple:
+ return []
+ elif iPropertyType == self.STRING:
+ return ''
+ elif iPropertyType == self.DATE:
+ return None
+ elif iPropertyType == self.URI:
+ return ''
+ elif iPropertyType == self.NODE:
+ return None
+ else:
+ return None
+ arValues = []
+ for elmResult in arResults:
+ sValue = None
+ if iPropertyType == self.NODE:
+ if bAllowMultiple:
+ arValues.append(elmResult)
+ continue
+ else:
+ return elmResult
+ sNodeName = elmResult.name.lower()
+ if (iPropertyType == self.EMAIL) and (sNodeName == 'a'):
+ sValue = (elmResult.get('href') or '').split('mailto:').pop().split('?')[0]
+ if sValue:
+ sValue = bNormalize and self.normalize(sValue) or sValue.strip()
+ if (not sValue) and (sNodeName == 'abbr'):
+ sValue = elmResult.get('title')
+ if sValue:
+ sValue = bNormalize and self.normalize(sValue) or sValue.strip()
+ if (not sValue) and (iPropertyType == self.URI):
+ if sNodeName == 'a':
+ sValue = elmResult.get('href')
+ elif sNodeName == 'img':
+ sValue = elmResult.get('src')
+ elif sNodeName == 'object':
+ sValue = elmResult.get('data')
+ if sValue:
+ sValue = bNormalize and self.normalize(sValue) or sValue.strip()
+ if (not sValue) and (sNodeName == 'img'):
+ sValue = elmResult.get('alt')
+ if sValue:
+ sValue = bNormalize and self.normalize(sValue) or sValue.strip()
+ if not sValue:
+ sValue = elmResult.renderContents()
+ sValue = re.sub(r'<\S[^>]*>', '', sValue)
+ sValue = sValue.replace('\r\n', '\n')
+ sValue = sValue.replace('\r', '\n')
+ if sValue:
+ sValue = bNormalize and self.normalize(sValue) or sValue.strip()
+ if not sValue:
+ continue
+ if iPropertyType == self.DATE:
+ sValue = _parse_date_iso8601(sValue)
+ if bAllowMultiple:
+ arValues.append(bAutoEscape and self.vcardEscape(sValue) or sValue)
+ else:
+ return bAutoEscape and self.vcardEscape(sValue) or sValue
+ return arValues
+
+ def findVCards(self, elmRoot, bAgentParsing=0):
+ sVCards = ''
+
+ if not bAgentParsing:
+ arCards = self.getPropertyValue(elmRoot, 'vcard', bAllowMultiple=1)
+ else:
+ arCards = [elmRoot]
+
+ for elmCard in arCards:
+ arLines = []
+
+ def processSingleString(sProperty):
+ sValue = self.getPropertyValue(elmCard, sProperty, self.STRING, bAutoEscape=1).decode(self.encoding)
+ if sValue:
+ arLines.append(self.vcardFold(sProperty.upper() + ':' + sValue))
+ return sValue or ''
+
+ def processSingleURI(sProperty):
+ sValue = self.getPropertyValue(elmCard, sProperty, self.URI)
+ if sValue:
+ sContentType = ''
+ sEncoding = ''
+ sValueKey = ''
+ if sValue.startswith('data:'):
+ sEncoding = ';ENCODING=b'
+ sContentType = sValue.split(';')[0].split('/').pop()
+ sValue = sValue.split(',', 1).pop()
+ else:
+ elmValue = self.getPropertyValue(elmCard, sProperty)
+ if elmValue:
+ if sProperty != 'url':
+ sValueKey = ';VALUE=uri'
+ sContentType = elmValue.get('type', '').strip().split('/').pop().strip()
+ sContentType = sContentType.upper()
+ if sContentType == 'OCTET-STREAM':
+ sContentType = ''
+ if sContentType:
+ sContentType = ';TYPE=' + sContentType.upper()
+ arLines.append(self.vcardFold(sProperty.upper() + sEncoding + sContentType + sValueKey + ':' + sValue))
+
+ def processTypeValue(sProperty, arDefaultType, arForceType=None):
+ arResults = self.getPropertyValue(elmCard, sProperty, bAllowMultiple=1)
+ for elmResult in arResults:
+ arType = self.getPropertyValue(elmResult, 'type', self.STRING, 1, 1)
+ if arForceType:
+ arType = self.unique(arForceType + arType)
+ if not arType:
+ arType = arDefaultType
+ sValue = self.getPropertyValue(elmResult, 'value', self.EMAIL, 0)
+ if sValue:
+ arLines.append(self.vcardFold(sProperty.upper() + ';TYPE=' + ','.join(arType) + ':' + sValue))
+
+ # AGENT
+ # must do this before all other properties because it is destructive
+ # (removes nested class="vcard" nodes so they don't interfere with
+ # this vcard's other properties)
+ arAgent = self.getPropertyValue(elmCard, 'agent', bAllowMultiple=1)
+ for elmAgent in arAgent:
+ if re.compile(r'\bvcard\b').search(elmAgent.get('class')):
+ sAgentValue = self.findVCards(elmAgent, 1) + '\n'
+ sAgentValue = sAgentValue.replace('\n', '\\n')
+ sAgentValue = sAgentValue.replace(';', '\\;')
+ if sAgentValue:
+ arLines.append(self.vcardFold('AGENT:' + sAgentValue))
+ # Completely remove the agent element from the parse tree
+ elmAgent.extract()
+ else:
+ sAgentValue = self.getPropertyValue(elmAgent, 'value', self.URI, bAutoEscape=1);
+ if sAgentValue:
+ arLines.append(self.vcardFold('AGENT;VALUE=uri:' + sAgentValue))
+
+ # FN (full name)
+ sFN = processSingleString('fn')
+
+ # N (name)
+ elmName = self.getPropertyValue(elmCard, 'n')
+ if elmName:
+ sFamilyName = self.getPropertyValue(elmName, 'family-name', self.STRING, bAutoEscape=1)
+ sGivenName = self.getPropertyValue(elmName, 'given-name', self.STRING, bAutoEscape=1)
+ arAdditionalNames = self.getPropertyValue(elmName, 'additional-name', self.STRING, 1, 1) + self.getPropertyValue(elmName, 'additional-names', self.STRING, 1, 1)
+ arHonorificPrefixes = self.getPropertyValue(elmName, 'honorific-prefix', self.STRING, 1, 1) + self.getPropertyValue(elmName, 'honorific-prefixes', self.STRING, 1, 1)
+ arHonorificSuffixes = self.getPropertyValue(elmName, 'honorific-suffix', self.STRING, 1, 1) + self.getPropertyValue(elmName, 'honorific-suffixes', self.STRING, 1, 1)
+ arLines.append(self.vcardFold('N:' + sFamilyName + ';' +
+ sGivenName + ';' +
+ ','.join(arAdditionalNames) + ';' +
+ ','.join(arHonorificPrefixes) + ';' +
+ ','.join(arHonorificSuffixes)))
+ elif sFN:
+ # implied "N" optimization
+ # http://microformats.org/wiki/hcard#Implied_.22N.22_Optimization
+ arNames = self.normalize(sFN).split()
+ if len(arNames) == 2:
+ bFamilyNameFirst = (arNames[0].endswith(',') or
+ len(arNames[1]) == 1 or
+ ((len(arNames[1]) == 2) and (arNames[1].endswith('.'))))
+ if bFamilyNameFirst:
+ arLines.append(self.vcardFold('N:' + arNames[0] + ';' + arNames[1]))
+ else:
+ arLines.append(self.vcardFold('N:' + arNames[1] + ';' + arNames[0]))
+
+ # SORT-STRING
+ sSortString = self.getPropertyValue(elmCard, 'sort-string', self.STRING, bAutoEscape=1)
+ if sSortString:
+ arLines.append(self.vcardFold('SORT-STRING:' + sSortString))
+
+ # NICKNAME
+ arNickname = self.getPropertyValue(elmCard, 'nickname', self.STRING, 1, 1)
+ if arNickname:
+ arLines.append(self.vcardFold('NICKNAME:' + ','.join(arNickname)))
+
+ # PHOTO
+ processSingleURI('photo')
+
+ # BDAY
+ dtBday = self.getPropertyValue(elmCard, 'bday', self.DATE)
+ if dtBday:
+ arLines.append(self.vcardFold('BDAY:' + self.toISO8601(dtBday)))
+
+ # ADR (address)
+ arAdr = self.getPropertyValue(elmCard, 'adr', bAllowMultiple=1)
+ for elmAdr in arAdr:
+ arType = self.getPropertyValue(elmAdr, 'type', self.STRING, 1, 1)
+ if not arType:
+ arType = ['intl','postal','parcel','work'] # default adr types, see RFC 2426 section 3.2.1
+ sPostOfficeBox = self.getPropertyValue(elmAdr, 'post-office-box', self.STRING, 0, 1)
+ sExtendedAddress = self.getPropertyValue(elmAdr, 'extended-address', self.STRING, 0, 1)
+ sStreetAddress = self.getPropertyValue(elmAdr, 'street-address', self.STRING, 0, 1)
+ sLocality = self.getPropertyValue(elmAdr, 'locality', self.STRING, 0, 1)
+ sRegion = self.getPropertyValue(elmAdr, 'region', self.STRING, 0, 1)
+ sPostalCode = self.getPropertyValue(elmAdr, 'postal-code', self.STRING, 0, 1)
+ sCountryName = self.getPropertyValue(elmAdr, 'country-name', self.STRING, 0, 1)
+ arLines.append(self.vcardFold('ADR;TYPE=' + ','.join(arType) + ':' +
+ sPostOfficeBox + ';' +
+ sExtendedAddress + ';' +
+ sStreetAddress + ';' +
+ sLocality + ';' +
+ sRegion + ';' +
+ sPostalCode + ';' +
+ sCountryName))
+
+ # LABEL
+ processTypeValue('label', ['intl','postal','parcel','work'])
+
+ # TEL (phone number)
+ processTypeValue('tel', ['voice'])
+
+ # EMAIL
+ processTypeValue('email', ['internet'], ['internet'])
+
+ # MAILER
+ processSingleString('mailer')
+
+ # TZ (timezone)
+ processSingleString('tz')
+
+ # GEO (geographical information)
+ elmGeo = self.getPropertyValue(elmCard, 'geo')
+ if elmGeo:
+ sLatitude = self.getPropertyValue(elmGeo, 'latitude', self.STRING, 0, 1)
+ sLongitude = self.getPropertyValue(elmGeo, 'longitude', self.STRING, 0, 1)
+ arLines.append(self.vcardFold('GEO:' + sLatitude + ';' + sLongitude))
+
+ # TITLE
+ processSingleString('title')
+
+ # ROLE
+ processSingleString('role')
+
+ # LOGO
+ processSingleURI('logo')
+
+ # ORG (organization)
+ elmOrg = self.getPropertyValue(elmCard, 'org')
+ if elmOrg:
+ sOrganizationName = self.getPropertyValue(elmOrg, 'organization-name', self.STRING, 0, 1)
+ if not sOrganizationName:
+ # implied "organization-name" optimization
+ # http://microformats.org/wiki/hcard#Implied_.22organization-name.22_Optimization
+ sOrganizationName = self.getPropertyValue(elmCard, 'org', self.STRING, 0, 1)
+ if sOrganizationName:
+ arLines.append(self.vcardFold('ORG:' + sOrganizationName))
+ else:
+ arOrganizationUnit = self.getPropertyValue(elmOrg, 'organization-unit', self.STRING, 1, 1)
+ arLines.append(self.vcardFold('ORG:' + sOrganizationName + ';' + ';'.join(arOrganizationUnit)))
+
+ # CATEGORY
+ arCategory = self.getPropertyValue(elmCard, 'category', self.STRING, 1, 1) + self.getPropertyValue(elmCard, 'categories', self.STRING, 1, 1)
+ if arCategory:
+ arLines.append(self.vcardFold('CATEGORIES:' + ','.join(arCategory)))
+
+ # NOTE
+ processSingleString('note')
+
+ # REV
+ processSingleString('rev')
+
+ # SOUND
+ processSingleURI('sound')
+
+ # UID
+ processSingleString('uid')
+
+ # URL
+ processSingleURI('url')
+
+ # CLASS
+ processSingleString('class')
+
+ # KEY
+ processSingleURI('key')
+
+ if arLines:
+ arLines = ['BEGIN:vCard','VERSION:3.0'] + arLines + ['END:vCard']
+ # XXX - this is super ugly; properly fix this with issue 148
+ for i, s in enumerate(arLines):
+ if not isinstance(s, str):
+ arLines[i] = s.decode('utf-8', 'ignore')
+ sVCards += '\n'.join(arLines) + '\n'
+
+ return sVCards.strip()
+
+ def isProbablyDownloadable(self, elm):
+ attrsD = elm.attrMap
+ if 'href' not in attrsD:
+ return 0
+ linktype = attrsD.get('type', '').strip()
+ if linktype.startswith('audio/') or \
+ linktype.startswith('video/') or \
+ (linktype.startswith('application/') and not linktype.endswith('xml')):
+ return 1
+ try:
+ path = urllib.parse.urlparse(attrsD['href'])[2]
+ except ValueError:
+ return 0
+ if path.find('.') == -1:
+ return 0
+ fileext = path.split('.').pop().lower()
+ return fileext in self.known_binary_extensions
+
+ def findTags(self):
+ all = lambda x: 1
+ for elm in self.document(all, {'rel': re.compile(r'\btag\b')}):
+ href = elm.get('href')
+ if not href:
+ continue
+ urlscheme, domain, path, params, query, fragment = \
+ urllib.parse.urlparse(_urljoin(self.baseuri, href))
+ segments = path.split('/')
+ tag = segments.pop()
+ if not tag:
+ if segments:
+ tag = segments.pop()
+ else:
+ # there are no tags
+ continue
+ tagscheme = urllib.parse.urlunparse((urlscheme, domain, '/'.join(segments), '', '', ''))
+ if not tagscheme.endswith('/'):
+ tagscheme += '/'
+ self.tags.append(FeedParserDict({"term": tag, "scheme": tagscheme, "label": elm.string or ''}))
+
+ def findEnclosures(self):
+ all = lambda x: 1
+ enclosure_match = re.compile(r'\benclosure\b')
+ for elm in self.document(all, {'href': re.compile(r'.+')}):
+ if not enclosure_match.search(elm.get('rel', '')) and not self.isProbablyDownloadable(elm):
+ continue
+ if elm.attrMap not in self.enclosures:
+ self.enclosures.append(elm.attrMap)
+ if elm.string and not elm.get('title'):
+ self.enclosures[-1]['title'] = elm.string
+
+ def findXFN(self):
+ all = lambda x: 1
+ for elm in self.document(all, {'rel': re.compile('.+'), 'href': re.compile('.+')}):
+ rels = elm.get('rel', '').split()
+ xfn_rels = [r for r in rels if r in self.known_xfn_relationships]
+ if xfn_rels:
+ self.xfn.append({"relationships": xfn_rels, "href": elm.get('href', ''), "name": elm.string})
+
+def _parseMicroformats(htmlSource, baseURI, encoding):
+ if not BeautifulSoup:
+ return
+ try:
+ p = _MicroformatsParser(htmlSource, baseURI, encoding)
+ except UnicodeEncodeError:
+ # sgmllib throws this exception when performing lookups of tags
+ # with non-ASCII characters in them.
+ return
+ p.vcard = p.findVCards(p.document)
+ p.findTags()
+ p.findEnclosures()
+ p.findXFN()
+ return {"tags": p.tags, "enclosures": p.enclosures, "xfn": p.xfn, "vcard": p.vcard}
+
class _RelativeURIResolver(_BaseHTMLProcessor):
- relative_uris = [('a', 'href'),
+ relative_uris = set([('a', 'href'),
('applet', 'codebase'),
('area', 'href'),
('blockquote', 'cite'),
@@ -1574,67 +2545,259 @@ class _RelativeURIResolver(_BaseHTMLProcessor):
('object', 'data'),
('object', 'usemap'),
('q', 'cite'),
- ('script', 'src')]
+ ('script', 'src'),
+ ('video', 'poster')])
- def __init__(self, baseuri, encoding):
- _BaseHTMLProcessor.__init__(self, encoding)
+ def __init__(self, baseuri, encoding, _type):
+ _BaseHTMLProcessor.__init__(self, encoding, _type)
self.baseuri = baseuri
def resolveURI(self, uri):
- return _urljoin(self.baseuri, uri)
-
+ return _makeSafeAbsoluteURI(self.baseuri, uri.strip())
+
def unknown_starttag(self, tag, attrs):
attrs = self.normalize_attrs(attrs)
attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs]
_BaseHTMLProcessor.unknown_starttag(self, tag, attrs)
-
-def _resolveRelativeURIs(htmlSource, baseURI, encoding):
- if _debug: sys.stderr.write('entering _resolveRelativeURIs\n')
- p = _RelativeURIResolver(baseURI, encoding)
+
+def _resolveRelativeURIs(htmlSource, baseURI, encoding, _type):
+ if not _SGML_AVAILABLE:
+ return htmlSource
+
+ p = _RelativeURIResolver(baseURI, encoding, _type)
p.feed(htmlSource)
return p.output()
+def _makeSafeAbsoluteURI(base, rel=None):
+ # bail if ACCEPTABLE_URI_SCHEMES is empty
+ if not ACCEPTABLE_URI_SCHEMES:
+ try:
+ return _urljoin(base, rel or '')
+ except ValueError:
+ return ''
+ if not base:
+ return rel or ''
+ if not rel:
+ try:
+ scheme = urllib.parse.urlparse(base)[0]
+ except ValueError:
+ return ''
+ if not scheme or scheme in ACCEPTABLE_URI_SCHEMES:
+ return base
+ return ''
+ try:
+ uri = _urljoin(base, rel)
+ except ValueError:
+ return ''
+ if uri.strip().split(':', 1)[0] not in ACCEPTABLE_URI_SCHEMES:
+ return ''
+ return uri
+
class _HTMLSanitizer(_BaseHTMLProcessor):
- acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', 'big',
- 'blockquote', 'br', 'button', 'caption', 'center', 'cite', 'code', 'col',
- 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'fieldset',
- 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input',
- 'ins', 'kbd', 'label', 'legend', 'li', 'map', 'menu', 'ol', 'optgroup',
- 'option', 'p', 'pre', 'q', 's', 'samp', 'select', 'small', 'span', 'strike',
- 'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'tfoot', 'th',
- 'thead', 'tr', 'tt', 'u', 'ul', 'var']
+ acceptable_elements = set(['a', 'abbr', 'acronym', 'address', 'area',
+ 'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button',
+ 'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup',
+ 'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn',
+ 'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset',
+ 'figcaption', 'figure', 'footer', 'font', 'form', 'header', 'h1',
+ 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins',
+ 'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter',
+ 'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option',
+ 'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',
+ 'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong',
+ 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot',
+ 'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video', 'noscript'])
- acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
- 'action', 'align', 'alt', 'axis', 'border', 'cellpadding', 'cellspacing',
- 'char', 'charoff', 'charset', 'checked', 'cite', 'class', 'clear', 'cols',
- 'colspan', 'color', 'compact', 'coords', 'datetime', 'dir', 'disabled',
- 'enctype', 'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace',
- 'id', 'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method',
- 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly',
- 'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
- 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title', 'type',
- 'usemap', 'valign', 'value', 'vspace', 'width']
+ acceptable_attributes = set(['abbr', 'accept', 'accept-charset', 'accesskey',
+ 'action', 'align', 'alt', 'autocomplete', 'autofocus', 'axis',
+ 'background', 'balance', 'bgcolor', 'bgproperties', 'border',
+ 'bordercolor', 'bordercolordark', 'bordercolorlight', 'bottompadding',
+ 'cellpadding', 'cellspacing', 'ch', 'challenge', 'char', 'charoff',
+ 'choff', 'charset', 'checked', 'cite', 'class', 'clear', 'color', 'cols',
+ 'colspan', 'compact', 'contenteditable', 'controls', 'coords', 'data',
+ 'datafld', 'datapagesize', 'datasrc', 'datetime', 'default', 'delay',
+ 'dir', 'disabled', 'draggable', 'dynsrc', 'enctype', 'end', 'face', 'for',
+ 'form', 'frame', 'galleryimg', 'gutter', 'headers', 'height', 'hidefocus',
+ 'hidden', 'high', 'href', 'hreflang', 'hspace', 'icon', 'id', 'inputmode',
+ 'ismap', 'keytype', 'label', 'leftspacing', 'lang', 'list', 'longdesc',
+ 'loop', 'loopcount', 'loopend', 'loopstart', 'low', 'lowsrc', 'max',
+ 'maxlength', 'media', 'method', 'min', 'multiple', 'name', 'nohref',
+ 'noshade', 'nowrap', 'open', 'optimum', 'pattern', 'ping', 'point-size',
+ 'poster', 'pqg', 'preload', 'prompt', 'radiogroup', 'readonly', 'rel',
+ 'repeat-max', 'repeat-min', 'replace', 'required', 'rev', 'rightspacing',
+ 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size', 'span',
+ 'src', 'start', 'step', 'summary', 'suppress', 'tabindex', 'target',
+ 'template', 'title', 'toppadding', 'type', 'unselectable', 'usemap',
+ 'urn', 'valign', 'value', 'variable', 'volume', 'vspace', 'vrml',
+ 'width', 'wrap', 'xml:lang'])
- unacceptable_elements_with_end_tag = ['script', 'applet']
+ unacceptable_elements_with_end_tag = set(['script', 'applet', 'style'])
+
+ acceptable_css_properties = set(['azimuth', 'background-color',
+ 'border-bottom-color', 'border-collapse', 'border-color',
+ 'border-left-color', 'border-right-color', 'border-top-color', 'clear',
+ 'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
+ 'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
+ 'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
+ 'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
+ 'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
+ 'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
+ 'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
+ 'white-space', 'width'])
+
+ # survey of common keywords found in feeds
+ acceptable_css_keywords = set(['auto', 'aqua', 'black', 'block', 'blue',
+ 'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
+ 'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
+ 'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
+ 'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
+ 'transparent', 'underline', 'white', 'yellow'])
+
+ valid_css_values = re.compile('^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|' +
+ '\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$')
+
+ mathml_elements = set(['annotation', 'annotation-xml', 'maction', 'math',
+ 'merror', 'mfenced', 'mfrac', 'mi', 'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded',
+ 'mphantom', 'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle',
+ 'msub', 'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
+ 'munderover', 'none', 'semantics'])
+
+ mathml_attributes = set(['actiontype', 'align', 'columnalign', 'columnalign',
+ 'columnalign', 'close', 'columnlines', 'columnspacing', 'columnspan', 'depth',
+ 'display', 'displaystyle', 'encoding', 'equalcolumns', 'equalrows',
+ 'fence', 'fontstyle', 'fontweight', 'frame', 'height', 'linethickness',
+ 'lspace', 'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant',
+ 'maxsize', 'minsize', 'open', 'other', 'rowalign', 'rowalign', 'rowalign',
+ 'rowlines', 'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
+ 'separator', 'separators', 'stretchy', 'width', 'width', 'xlink:href',
+ 'xlink:show', 'xlink:type', 'xmlns', 'xmlns:xlink'])
+
+ # svgtiny - foreignObject + linearGradient + radialGradient + stop
+ svg_elements = set(['a', 'animate', 'animateColor', 'animateMotion',
+ 'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'foreignObject',
+ 'font-face', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern',
+ 'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph', 'mpath',
+ 'path', 'polygon', 'polyline', 'radialGradient', 'rect', 'set', 'stop',
+ 'svg', 'switch', 'text', 'title', 'tspan', 'use'])
+
+ # svgtiny + class + opacity + offset + xmlns + xmlns:xlink
+ svg_attributes = set(['accent-height', 'accumulate', 'additive', 'alphabetic',
+ 'arabic-form', 'ascent', 'attributeName', 'attributeType',
+ 'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
+ 'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx',
+ 'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-opacity',
+ 'fill-rule', 'font-family', 'font-size', 'font-stretch', 'font-style',
+ 'font-variant', 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2',
+ 'glyph-name', 'gradientUnits', 'hanging', 'height', 'horiz-adv-x',
+ 'horiz-origin-x', 'id', 'ideographic', 'k', 'keyPoints', 'keySplines',
+ 'keyTimes', 'lang', 'mathematical', 'marker-end', 'marker-mid',
+ 'marker-start', 'markerHeight', 'markerUnits', 'markerWidth', 'max',
+ 'min', 'name', 'offset', 'opacity', 'orient', 'origin',
+ 'overline-position', 'overline-thickness', 'panose-1', 'path',
+ 'pathLength', 'points', 'preserveAspectRatio', 'r', 'refX', 'refY',
+ 'repeatCount', 'repeatDur', 'requiredExtensions', 'requiredFeatures',
+ 'restart', 'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv',
+ 'stop-color', 'stop-opacity', 'strikethrough-position',
+ 'strikethrough-thickness', 'stroke', 'stroke-dasharray',
+ 'stroke-dashoffset', 'stroke-linecap', 'stroke-linejoin',
+ 'stroke-miterlimit', 'stroke-opacity', 'stroke-width', 'systemLanguage',
+ 'target', 'text-anchor', 'to', 'transform', 'type', 'u1', 'u2',
+ 'underline-position', 'underline-thickness', 'unicode', 'unicode-range',
+ 'units-per-em', 'values', 'version', 'viewBox', 'visibility', 'width',
+ 'widths', 'x', 'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole',
+ 'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title', 'xlink:type',
+ 'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1',
+ 'y2', 'zoomAndPan'])
+
+ svg_attr_map = None
+ svg_elem_map = None
+
+ acceptable_svg_properties = set([ 'fill', 'fill-opacity', 'fill-rule',
+ 'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
+ 'stroke-opacity'])
def reset(self):
_BaseHTMLProcessor.reset(self)
self.unacceptablestack = 0
-
+ self.mathmlOK = 0
+ self.svgOK = 0
+
def unknown_starttag(self, tag, attrs):
- if not tag in self.acceptable_elements:
+ acceptable_attributes = self.acceptable_attributes
+ keymap = {}
+ if not tag in self.acceptable_elements or self.svgOK:
if tag in self.unacceptable_elements_with_end_tag:
self.unacceptablestack += 1
- return
- attrs = self.normalize_attrs(attrs)
- attrs = [(key, value) for key, value in attrs if key in self.acceptable_attributes]
- _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)
-
+
+ # add implicit namespaces to html5 inline svg/mathml
+ if self._type.endswith('html'):
+ if not dict(attrs).get('xmlns'):
+ if tag=='svg':
+ attrs.append( ('xmlns','http://www.w3.org/2000/svg') )
+ if tag=='math':
+ attrs.append( ('xmlns','http://www.w3.org/1998/Math/MathML') )
+
+ # not otherwise acceptable, perhaps it is MathML or SVG?
+ if tag=='math' and ('xmlns','http://www.w3.org/1998/Math/MathML') in attrs:
+ self.mathmlOK += 1
+ if tag=='svg' and ('xmlns','http://www.w3.org/2000/svg') in attrs:
+ self.svgOK += 1
+
+ # chose acceptable attributes based on tag class, else bail
+ if self.mathmlOK and tag in self.mathml_elements:
+ acceptable_attributes = self.mathml_attributes
+ elif self.svgOK and tag in self.svg_elements:
+ # for most vocabularies, lowercasing is a good idea. Many
+ # svg elements, however, are camel case
+ if not self.svg_attr_map:
+ lower=[attr.lower() for attr in self.svg_attributes]
+ mix=[a for a in self.svg_attributes if a not in lower]
+ self.svg_attributes = lower
+ self.svg_attr_map = dict([(a.lower(),a) for a in mix])
+
+ lower=[attr.lower() for attr in self.svg_elements]
+ mix=[a for a in self.svg_elements if a not in lower]
+ self.svg_elements = lower
+ self.svg_elem_map = dict([(a.lower(),a) for a in mix])
+ acceptable_attributes = self.svg_attributes
+ tag = self.svg_elem_map.get(tag,tag)
+ keymap = self.svg_attr_map
+ elif not tag in self.acceptable_elements:
+ return
+
+ # declare xlink namespace, if needed
+ if self.mathmlOK or self.svgOK:
+ if [n_v for n_v in attrs if n_v[0].startswith('xlink:')]:
+ if not ('xmlns:xlink','http://www.w3.org/1999/xlink') in attrs:
+ attrs.append(('xmlns:xlink','http://www.w3.org/1999/xlink'))
+
+ clean_attrs = []
+ for key, value in self.normalize_attrs(attrs):
+ if key in acceptable_attributes:
+ key=keymap.get(key,key)
+ # make sure the uri uses an acceptable uri scheme
+ if key == 'href':
+ value = _makeSafeAbsoluteURI(value)
+ clean_attrs.append((key,value))
+ elif key=='style':
+ clean_value = self.sanitize_style(value)
+ if clean_value:
+ clean_attrs.append((key,clean_value))
+ _BaseHTMLProcessor.unknown_starttag(self, tag, clean_attrs)
+
def unknown_endtag(self, tag):
if not tag in self.acceptable_elements:
if tag in self.unacceptable_elements_with_end_tag:
self.unacceptablestack -= 1
- return
+ if self.mathmlOK and tag in self.mathml_elements:
+ if tag == 'math' and self.mathmlOK:
+ self.mathmlOK -= 1
+ elif self.svgOK and tag in self.svg_elements:
+ tag = self.svg_elem_map.get(tag,tag)
+ if tag == 'svg' and self.svgOK:
+ self.svgOK -= 1
+ else:
+ return
_BaseHTMLProcessor.unknown_endtag(self, tag)
def handle_pi(self, text):
@@ -1647,8 +2810,53 @@ class _HTMLSanitizer(_BaseHTMLProcessor):
if not self.unacceptablestack:
_BaseHTMLProcessor.handle_data(self, text)
-def _sanitizeHTML(htmlSource, encoding):
- p = _HTMLSanitizer(encoding)
+ def sanitize_style(self, style):
+ # disallow urls
+ style=re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ',style)
+
+ # gauntlet
+ if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
+ return ''
+ # This replaced a regexp that used re.match and was prone to pathological back-tracking.
+ if re.sub("\s*[-\w]+\s*:\s*[^:;]*;?", '', style).strip():
+ return ''
+
+ clean = []
+ for prop,value in re.findall("([-\w]+)\s*:\s*([^:;]*)",style):
+ if not value:
+ continue
+ if prop.lower() in self.acceptable_css_properties:
+ clean.append(prop + ': ' + value + ';')
+ elif prop.split('-')[0].lower() in ['background','border','margin','padding']:
+ for keyword in value.split():
+ if not keyword in self.acceptable_css_keywords and \
+ not self.valid_css_values.match(keyword):
+ break
+ else:
+ clean.append(prop + ': ' + value + ';')
+ elif self.svgOK and prop.lower() in self.acceptable_svg_properties:
+ clean.append(prop + ': ' + value + ';')
+
+ return ' '.join(clean)
+
+ def parse_comment(self, i, report=1):
+ ret = _BaseHTMLProcessor.parse_comment(self, i, report)
+ if ret >= 0:
+ return ret
+ # if ret == -1, this may be a malicious attempt to circumvent
+ # sanitization, or a page-destroying unclosed comment
+ match = re.compile(r'--[^>]*>').search(self.rawdata, i+4)
+ if match:
+ return match.end()
+ # unclosed comment; deliberately fail to handle_data()
+ return len(self.rawdata)
+
+
+def _sanitizeHTML(htmlSource, encoding, _type):
+ if not _SGML_AVAILABLE:
+ return htmlSource
+ p = _HTMLSanitizer(encoding, _type)
+ htmlSource = htmlSource.replace(''):
@@ -1686,61 +2894,50 @@ def _sanitizeHTML(htmlSource, encoding):
data = data.strip().replace('\r\n', '\n')
return data
-class _FeedURLHandler(urllib2.HTTPDigestAuthHandler, urllib2.HTTPRedirectHandler, urllib2.HTTPDefaultErrorHandler):
+class _FeedURLHandler(urllib.request.HTTPDigestAuthHandler, urllib.request.HTTPRedirectHandler, urllib.request.HTTPDefaultErrorHandler):
def http_error_default(self, req, fp, code, msg, headers):
- if ((code / 100) == 3) and (code != 304):
- return self.http_error_302(req, fp, code, msg, headers)
- infourl = urllib.addinfourl(fp, headers, req.get_full_url())
- infourl.status = code
- return infourl
+ # The default implementation just raises HTTPError.
+ # Forget that.
+ fp.status = code
+ return fp
- def http_error_302(self, req, fp, code, msg, headers):
- if headers.dict.has_key('location'):
- infourl = urllib2.HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers)
- else:
- infourl = urllib.addinfourl(fp, headers, req.get_full_url())
- if not hasattr(infourl, 'status'):
- infourl.status = code
- return infourl
+ def http_error_301(self, req, fp, code, msg, hdrs):
+ result = urllib.request.HTTPRedirectHandler.http_error_301(self, req, fp,
+ code, msg, hdrs)
+ result.status = code
+ result.newurl = result.geturl()
+ return result
+ # The default implementations in urllib2.HTTPRedirectHandler
+ # are identical, so hardcoding a http_error_301 call above
+ # won't affect anything
+ http_error_300 = http_error_301
+ http_error_302 = http_error_301
+ http_error_303 = http_error_301
+ http_error_307 = http_error_301
- def http_error_301(self, req, fp, code, msg, headers):
- if headers.dict.has_key('location'):
- infourl = urllib2.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, headers)
- else:
- infourl = urllib.addinfourl(fp, headers, req.get_full_url())
- if not hasattr(infourl, 'status'):
- infourl.status = code
- return infourl
-
- http_error_300 = http_error_302
- http_error_303 = http_error_302
- http_error_307 = http_error_302
-
def http_error_401(self, req, fp, code, msg, headers):
# Check if
# - server requires digest auth, AND
# - we tried (unsuccessfully) with basic auth, AND
- # - we're using Python 2.3.3 or later (digest auth is irreparably broken in earlier versions)
# If all conditions hold, parse authentication information
# out of the Authorization header we sent the first time
# (for the username and password) and the WWW-Authenticate
# header the server sent back (for the realm) and retry
# the request with the appropriate digest auth headers instead.
# This evil genius hack has been brought to you by Aaron Swartz.
- host = urlparse.urlparse(req.get_full_url())[1]
- try:
- assert sys.version.split()[0] >= '2.3.3'
- assert base64 != None
- user, passw = base64.decodestring(req.headers['Authorization'].split(' ')[1]).split(':')
- realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0]
- self.add_password(realm, host, user, passw)
- retry = self.http_error_auth_reqed('www-authenticate', host, req, headers)
- self.reset_retry_count()
- return retry
- except:
+ host = urllib.parse.urlparse(req.get_full_url())[1]
+ if base64 is None or 'Authorization' not in req.headers \
+ or 'WWW-Authenticate' not in headers:
return self.http_error_default(req, fp, code, msg, headers)
+ auth = _base64decode(req.headers['Authorization'].split(' ')[1])
+ user, passw = auth.split(':')
+ realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0]
+ self.add_password(realm, host, user, passw)
+ retry = self.http_error_auth_reqed('www-authenticate', host, req, headers)
+ self.reset_retry_count()
+ return retry
-def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers):
+def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers):
"""URL, filename, or string --> stream
This function lets you define parsers that take any input source
@@ -1752,10 +2949,12 @@ def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, h
If the etag argument is supplied, it will be used as the value of an
If-None-Match request header.
- If the modified argument is supplied, it must be a tuple of 9 integers
- as returned by gmtime() in the standard Python time module. This MUST
- be in GMT (Greenwich Mean Time). The formatted date/time will be used
- as the value of an If-Modified-Since request header.
+ If the modified argument is supplied, it can be a tuple of 9 integers
+ (as returned by gmtime() in the standard Python time module) or a date
+ string in any format supported by feedparser. Regardless, it MUST
+ be in GMT (Greenwich Mean Time). It will be reformatted into an
+ RFC 1123-compliant date and used as the value of an If-Modified-Since
+ request header.
If the agent argument is supplied, it will be used as the value of a
User-Agent request header.
@@ -1765,76 +2964,132 @@ def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, h
If handlers is supplied, it is a list of handlers used to build a
urllib2 opener.
+
+ if request_headers is supplied it is a dictionary of HTTP request headers
+ that will override the values generated by FeedParser.
"""
if hasattr(url_file_stream_or_string, 'read'):
return url_file_stream_or_string
- if url_file_stream_or_string == '-':
- return sys.stdin
-
- if urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp'):
+ if isinstance(url_file_stream_or_string, str) \
+ and urllib.parse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp', 'file', 'feed'):
+ # Deal with the feed URI scheme
+ if url_file_stream_or_string.startswith('feed:http'):
+ url_file_stream_or_string = url_file_stream_or_string[5:]
+ elif url_file_stream_or_string.startswith('feed:'):
+ url_file_stream_or_string = 'http:' + url_file_stream_or_string[5:]
if not agent:
agent = USER_AGENT
- # test for inline user:password for basic auth
+ # Test for inline user:password credentials for HTTP basic auth
auth = None
- if base64:
- urltype, rest = urllib.splittype(url_file_stream_or_string)
- realhost, rest = urllib.splithost(rest)
+ if base64 and not url_file_stream_or_string.startswith('ftp:'):
+ urltype, rest = urllib.parse.splittype(url_file_stream_or_string)
+ realhost, rest = urllib.parse.splithost(rest)
if realhost:
- user_passwd, realhost = urllib.splituser(realhost)
+ user_passwd, realhost = urllib.parse.splituser(realhost)
if user_passwd:
url_file_stream_or_string = '%s://%s%s' % (urltype, realhost, rest)
- auth = base64.encodestring(user_passwd).strip()
+ auth = base64.standard_b64encode(user_passwd).strip()
+
+ # iri support
+ if isinstance(url_file_stream_or_string, str):
+ url_file_stream_or_string = _convert_to_idn(url_file_stream_or_string)
+
# try to open with urllib2 (to use optional headers)
- request = urllib2.Request(url_file_stream_or_string)
- request.add_header('User-Agent', agent)
- if etag:
- request.add_header('If-None-Match', etag)
- if modified:
- # format into an RFC 1123-compliant timestamp. We can't use
- # time.strftime() since the %a and %b directives can be affected
- # by the current locale, but RFC 2616 states that dates must be
- # in English.
- short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
- months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
- request.add_header('If-Modified-Since', '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5]))
- if referrer:
- request.add_header('Referer', referrer)
- if gzip and zlib:
- request.add_header('Accept-encoding', 'gzip, deflate')
- elif gzip:
- request.add_header('Accept-encoding', 'gzip')
- elif zlib:
- request.add_header('Accept-encoding', 'deflate')
- else:
- request.add_header('Accept-encoding', '')
- if auth:
- request.add_header('Authorization', 'Basic %s' % auth)
- if ACCEPT_HEADER:
- request.add_header('Accept', ACCEPT_HEADER)
- request.add_header('A-IM', 'feed') # RFC 3229 support
- opener = apply(urllib2.build_opener, tuple([_FeedURLHandler()] + handlers))
+ request = _build_urllib2_request(url_file_stream_or_string, agent, etag, modified, referrer, auth, request_headers)
+ opener = urllib.request.build_opener(*tuple(handlers + [_FeedURLHandler()]))
opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
try:
return opener.open(request)
finally:
opener.close() # JohnD
-
+
# try to open with native open function (if url_file_stream_or_string is a filename)
try:
- return open(url_file_stream_or_string)
- except:
+ return open(url_file_stream_or_string, 'rb')
+ except (IOError, UnicodeEncodeError, TypeError):
+ # if url_file_stream_or_string is a unicode object that
+ # cannot be converted to the encoding returned by
+ # sys.getfilesystemencoding(), a UnicodeEncodeError
+ # will be thrown
+ # If url_file_stream_or_string is a string that contains NULL
+ # (such as an XML document encoded in UTF-32), TypeError will
+ # be thrown.
pass
# treat url_file_stream_or_string as string
- return _StringIO(str(url_file_stream_or_string))
+ if isinstance(url_file_stream_or_string, str):
+ return _StringIO(url_file_stream_or_string.encode('utf-8'))
+ return _StringIO(url_file_stream_or_string)
+
+def _convert_to_idn(url):
+ """Convert a URL to IDN notation"""
+ # this function should only be called with a unicode string
+ # strategy: if the host cannot be encoded in ascii, then
+ # it'll be necessary to encode it in idn form
+ parts = list(urllib.parse.urlsplit(url))
+ try:
+ parts[1].encode('ascii')
+ except UnicodeEncodeError:
+ # the url needs to be converted to idn notation
+ host = parts[1].rsplit(':', 1)
+ newhost = []
+ port = ''
+ if len(host) == 2:
+ port = host.pop()
+ for h in host[0].split('.'):
+ newhost.append(h.encode('idna').decode('utf-8'))
+ parts[1] = '.'.join(newhost)
+ if port:
+ parts[1] += ':' + port
+ return urllib.parse.urlunsplit(parts)
+ else:
+ return url
+
+def _build_urllib2_request(url, agent, etag, modified, referrer, auth, request_headers):
+ request = urllib.request.Request(url)
+ request.add_header('User-Agent', agent)
+ if etag:
+ request.add_header('If-None-Match', etag)
+ if isinstance(modified, str):
+ modified = _parse_date(modified)
+ elif isinstance(modified, datetime.datetime):
+ modified = modified.utctimetuple()
+ if modified:
+ # format into an RFC 1123-compliant timestamp. We can't use
+ # time.strftime() since the %a and %b directives can be affected
+ # by the current locale, but RFC 2616 states that dates must be
+ # in English.
+ short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
+ months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
+ request.add_header('If-Modified-Since', '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5]))
+ if referrer:
+ request.add_header('Referer', referrer)
+ if gzip and zlib:
+ request.add_header('Accept-encoding', 'gzip, deflate')
+ elif gzip:
+ request.add_header('Accept-encoding', 'gzip')
+ elif zlib:
+ request.add_header('Accept-encoding', 'deflate')
+ else:
+ request.add_header('Accept-encoding', '')
+ if auth:
+ request.add_header('Authorization', 'Basic %s' % auth)
+ if ACCEPT_HEADER:
+ request.add_header('Accept', ACCEPT_HEADER)
+ # use this for whatever -- cookies, special headers, etc
+ # [('Cookie','Something'),('x-special-header','Another Value')]
+ for header_name, header_value in list(request_headers.items()):
+ request.add_header(header_name, header_value)
+ request.add_header('A-IM', 'feed') # RFC 3229 support
+ return request
_date_handlers = []
def registerDateHandler(func):
'''Register a date handler function (takes string, returns 9-tuple date in GMT)'''
_date_handlers.insert(0, func)
-
+
# ISO-8601 date parsing routines written by Fazal Majid.
# The ISO 8601 standard is very convoluted and irregular - a full ISO 8601
# parser is beyond the scope of feedparser and would be a worthwhile addition
@@ -1844,8 +3099,8 @@ def registerDateHandler(func):
# 0301-04-01), so we use templates instead.
# Please note the order in templates is significant because we need a
# greedy match.
-_iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-MM', 'YYYY-?OOO',
- 'YY-?MM-?DD', 'YY-?OOO', 'YYYY',
+_iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-0MM?-?DD', 'YYYY-MM', 'YYYY-?OOO',
+ 'YY-?MM-?DD', 'YY-?OOO', 'YYYY',
'-YY-?MM', '-OOO', '-YY',
'--MM-?DD', '--MM',
'---DD',
@@ -1860,19 +3115,29 @@ _iso8601_re = [
'CC', r'(?P
\d\d$)')
+ r'(T?(?P\d{2}):(?P\d{2})'
+ r'(:(?P\d{2}))?'
+ + r'(\.(?P\d+))?'
+ r'(?P[+-](?P\d{2})(:(?P\d{2}))?|Z)?)?'
for tmpl in _iso8601_tmpl]
-del tmpl
+try:
+ del tmpl
+except NameError:
+ pass
_iso8601_matches = [re.compile(regex).match for regex in _iso8601_re]
-del regex
+try:
+ del regex
+except NameError:
+ pass
def _parse_date_iso8601(dateString):
'''Parse a variety of ISO-8601-compatible formats like 20040105'''
m = None
for _iso8601_match in _iso8601_matches:
m = _iso8601_match(dateString)
- if m: break
- if not m: return
- if m.span() == (0, 0): return
+ if m:
+ break
+ if not m:
+ return
+ if m.span() == (0, 0):
+ return
params = m.groupdict()
ordinal = params.get('ordinal', 0)
if ordinal:
@@ -1910,7 +3175,7 @@ def _parse_date_iso8601(dateString):
day = int(day)
# special case of the century - is the first year of the 21st century
# 2000 or 2001 ? The debate goes on...
- if 'century' in params.keys():
+ if 'century' in params:
year = (int(params['century']) - 1) * 100 + 1
# in ISO 8601 most fields are optional
for field in ['hour', 'minute', 'second', 'tzhour', 'tzmin']:
@@ -1918,14 +3183,10 @@ def _parse_date_iso8601(dateString):
params[field] = 0
hour = int(params.get('hour', 0))
minute = int(params.get('minute', 0))
- second = int(params.get('second', 0))
+ second = int(float(params.get('second', 0)))
# weekday is normalized by mktime(), we can ignore it
weekday = 0
- # daylight savings is complex, but not needed for feedparser's purposes
- # as time zones, if specified, include mention of whether it is active
- # (e.g. PST vs. PDT, CET). Using -1 is implementation-dependent and
- # and most implementations have DST bugs
- daylight_savings_flag = 0
+ daylight_savings_flag = -1
tm = [year, month, day, hour, minute, second, weekday,
ordinal, daylight_savings_flag]
# ISO 8601 time zone adjustments
@@ -1942,38 +3203,39 @@ def _parse_date_iso8601(dateString):
# Python's time.mktime() is a wrapper around the ANSI C mktime(3c)
# which is guaranteed to normalize d/m/y/h/m/s.
# Many implementations have bugs, but we'll pretend they don't.
- return time.localtime(time.mktime(tm))
+ return time.localtime(time.mktime(tuple(tm)))
registerDateHandler(_parse_date_iso8601)
-
+
# 8-bit date handling routines written by ytrewq1.
-_korean_year = u'\ub144' # b3e2 in euc-kr
-_korean_month = u'\uc6d4' # bff9 in euc-kr
-_korean_day = u'\uc77c' # c0cf in euc-kr
-_korean_am = u'\uc624\uc804' # bfc0 c0fc in euc-kr
-_korean_pm = u'\uc624\ud6c4' # bfc0 c8c4 in euc-kr
+_korean_year = '\ub144' # b3e2 in euc-kr
+_korean_month = '\uc6d4' # bff9 in euc-kr
+_korean_day = '\uc77c' # c0cf in euc-kr
+_korean_am = '\uc624\uc804' # bfc0 c0fc in euc-kr
+_korean_pm = '\uc624\ud6c4' # bfc0 c8c4 in euc-kr
_korean_onblog_date_re = \
re.compile('(\d{4})%s\s+(\d{2})%s\s+(\d{2})%s\s+(\d{2}):(\d{2}):(\d{2})' % \
(_korean_year, _korean_month, _korean_day))
_korean_nate_date_re = \
- re.compile(u'(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})' % \
+ re.compile('(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})' % \
(_korean_am, _korean_pm))
def _parse_date_onblog(dateString):
'''Parse a string according to the OnBlog 8-bit date format'''
m = _korean_onblog_date_re.match(dateString)
- if not m: return
+ if not m:
+ return
w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
{'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\
'zonediff': '+09:00'}
- if _debug: sys.stderr.write('OnBlog date parsed as: %s\n' % w3dtfdate)
return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_onblog)
def _parse_date_nate(dateString):
'''Parse a string according to the Nate 8-bit date format'''
m = _korean_nate_date_re.match(dateString)
- if not m: return
+ if not m:
+ return
hour = int(m.group(5))
ampm = m.group(4)
if (ampm == _korean_pm):
@@ -1985,118 +3247,97 @@ def _parse_date_nate(dateString):
{'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
'hour': hour, 'minute': m.group(6), 'second': m.group(7),\
'zonediff': '+09:00'}
- if _debug: sys.stderr.write('Nate date parsed as: %s\n' % w3dtfdate)
return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_nate)
-_mssql_date_re = \
- re.compile('(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})(\.\d+)?')
-def _parse_date_mssql(dateString):
- '''Parse a string according to the MS SQL date format'''
- m = _mssql_date_re.match(dateString)
- if not m: return
- w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
- {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
- 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\
- 'zonediff': '+09:00'}
- if _debug: sys.stderr.write('MS SQL date parsed as: %s\n' % w3dtfdate)
- return _parse_date_w3dtf(w3dtfdate)
-registerDateHandler(_parse_date_mssql)
-
# Unicode strings for Greek date strings
_greek_months = \
{ \
- u'\u0399\u03b1\u03bd': u'Jan', # c9e1ed in iso-8859-7
- u'\u03a6\u03b5\u03b2': u'Feb', # d6e5e2 in iso-8859-7
- u'\u039c\u03ac\u03ce': u'Mar', # ccdcfe in iso-8859-7
- u'\u039c\u03b1\u03ce': u'Mar', # cce1fe in iso-8859-7
- u'\u0391\u03c0\u03c1': u'Apr', # c1f0f1 in iso-8859-7
- u'\u039c\u03ac\u03b9': u'May', # ccdce9 in iso-8859-7
- u'\u039c\u03b1\u03ca': u'May', # cce1fa in iso-8859-7
- u'\u039c\u03b1\u03b9': u'May', # cce1e9 in iso-8859-7
- u'\u0399\u03bf\u03cd\u03bd': u'Jun', # c9effded in iso-8859-7
- u'\u0399\u03bf\u03bd': u'Jun', # c9efed in iso-8859-7
- u'\u0399\u03bf\u03cd\u03bb': u'Jul', # c9effdeb in iso-8859-7
- u'\u0399\u03bf\u03bb': u'Jul', # c9f9eb in iso-8859-7
- u'\u0391\u03cd\u03b3': u'Aug', # c1fde3 in iso-8859-7
- u'\u0391\u03c5\u03b3': u'Aug', # c1f5e3 in iso-8859-7
- u'\u03a3\u03b5\u03c0': u'Sep', # d3e5f0 in iso-8859-7
- u'\u039f\u03ba\u03c4': u'Oct', # cfeaf4 in iso-8859-7
- u'\u039d\u03bf\u03ad': u'Nov', # cdefdd in iso-8859-7
- u'\u039d\u03bf\u03b5': u'Nov', # cdefe5 in iso-8859-7
- u'\u0394\u03b5\u03ba': u'Dec', # c4e5ea in iso-8859-7
+ '\u0399\u03b1\u03bd': 'Jan', # c9e1ed in iso-8859-7
+ '\u03a6\u03b5\u03b2': 'Feb', # d6e5e2 in iso-8859-7
+ '\u039c\u03ac\u03ce': 'Mar', # ccdcfe in iso-8859-7
+ '\u039c\u03b1\u03ce': 'Mar', # cce1fe in iso-8859-7
+ '\u0391\u03c0\u03c1': 'Apr', # c1f0f1 in iso-8859-7
+ '\u039c\u03ac\u03b9': 'May', # ccdce9 in iso-8859-7
+ '\u039c\u03b1\u03ca': 'May', # cce1fa in iso-8859-7
+ '\u039c\u03b1\u03b9': 'May', # cce1e9 in iso-8859-7
+ '\u0399\u03bf\u03cd\u03bd': 'Jun', # c9effded in iso-8859-7
+ '\u0399\u03bf\u03bd': 'Jun', # c9efed in iso-8859-7
+ '\u0399\u03bf\u03cd\u03bb': 'Jul', # c9effdeb in iso-8859-7
+ '\u0399\u03bf\u03bb': 'Jul', # c9f9eb in iso-8859-7
+ '\u0391\u03cd\u03b3': 'Aug', # c1fde3 in iso-8859-7
+ '\u0391\u03c5\u03b3': 'Aug', # c1f5e3 in iso-8859-7
+ '\u03a3\u03b5\u03c0': 'Sep', # d3e5f0 in iso-8859-7
+ '\u039f\u03ba\u03c4': 'Oct', # cfeaf4 in iso-8859-7
+ '\u039d\u03bf\u03ad': 'Nov', # cdefdd in iso-8859-7
+ '\u039d\u03bf\u03b5': 'Nov', # cdefe5 in iso-8859-7
+ '\u0394\u03b5\u03ba': 'Dec', # c4e5ea in iso-8859-7
}
_greek_wdays = \
{ \
- u'\u039a\u03c5\u03c1': u'Sun', # caf5f1 in iso-8859-7
- u'\u0394\u03b5\u03c5': u'Mon', # c4e5f5 in iso-8859-7
- u'\u03a4\u03c1\u03b9': u'Tue', # d4f1e9 in iso-8859-7
- u'\u03a4\u03b5\u03c4': u'Wed', # d4e5f4 in iso-8859-7
- u'\u03a0\u03b5\u03bc': u'Thu', # d0e5ec in iso-8859-7
- u'\u03a0\u03b1\u03c1': u'Fri', # d0e1f1 in iso-8859-7
- u'\u03a3\u03b1\u03b2': u'Sat', # d3e1e2 in iso-8859-7
+ '\u039a\u03c5\u03c1': 'Sun', # caf5f1 in iso-8859-7
+ '\u0394\u03b5\u03c5': 'Mon', # c4e5f5 in iso-8859-7
+ '\u03a4\u03c1\u03b9': 'Tue', # d4f1e9 in iso-8859-7
+ '\u03a4\u03b5\u03c4': 'Wed', # d4e5f4 in iso-8859-7
+ '\u03a0\u03b5\u03bc': 'Thu', # d0e5ec in iso-8859-7
+ '\u03a0\u03b1\u03c1': 'Fri', # d0e1f1 in iso-8859-7
+ '\u03a3\u03b1\u03b2': 'Sat', # d3e1e2 in iso-8859-7
}
_greek_date_format_re = \
- re.compile(u'([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)')
+ re.compile('([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)')
def _parse_date_greek(dateString):
'''Parse a string according to a Greek 8-bit date format.'''
m = _greek_date_format_re.match(dateString)
- if not m: return
- try:
- wday = _greek_wdays[m.group(1)]
- month = _greek_months[m.group(3)]
- except:
+ if not m:
return
+ wday = _greek_wdays[m.group(1)]
+ month = _greek_months[m.group(3)]
rfc822date = '%(wday)s, %(day)s %(month)s %(year)s %(hour)s:%(minute)s:%(second)s %(zonediff)s' % \
{'wday': wday, 'day': m.group(2), 'month': month, 'year': m.group(4),\
'hour': m.group(5), 'minute': m.group(6), 'second': m.group(7),\
'zonediff': m.group(8)}
- if _debug: sys.stderr.write('Greek date parsed as: %s\n' % rfc822date)
return _parse_date_rfc822(rfc822date)
registerDateHandler(_parse_date_greek)
# Unicode strings for Hungarian date strings
_hungarian_months = \
{ \
- u'janu\u00e1r': u'01', # e1 in iso-8859-2
- u'febru\u00e1ri': u'02', # e1 in iso-8859-2
- u'm\u00e1rcius': u'03', # e1 in iso-8859-2
- u'\u00e1prilis': u'04', # e1 in iso-8859-2
- u'm\u00e1ujus': u'05', # e1 in iso-8859-2
- u'j\u00fanius': u'06', # fa in iso-8859-2
- u'j\u00falius': u'07', # fa in iso-8859-2
- u'augusztus': u'08',
- u'szeptember': u'09',
- u'okt\u00f3ber': u'10', # f3 in iso-8859-2
- u'november': u'11',
- u'december': u'12',
+ 'janu\u00e1r': '01', # e1 in iso-8859-2
+ 'febru\u00e1ri': '02', # e1 in iso-8859-2
+ 'm\u00e1rcius': '03', # e1 in iso-8859-2
+ '\u00e1prilis': '04', # e1 in iso-8859-2
+ 'm\u00e1ujus': '05', # e1 in iso-8859-2
+ 'j\u00fanius': '06', # fa in iso-8859-2
+ 'j\u00falius': '07', # fa in iso-8859-2
+ 'augusztus': '08',
+ 'szeptember': '09',
+ 'okt\u00f3ber': '10', # f3 in iso-8859-2
+ 'november': '11',
+ 'december': '12',
}
_hungarian_date_format_re = \
- re.compile(u'(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})((\+|-)(\d{,2}:\d{2}))')
+ re.compile('(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})((\+|-)(\d{,2}:\d{2}))')
def _parse_date_hungarian(dateString):
'''Parse a string according to a Hungarian 8-bit date format.'''
m = _hungarian_date_format_re.match(dateString)
- if not m: return
- try:
- month = _hungarian_months[m.group(2)]
- day = m.group(3)
- if len(day) == 1:
- day = '0' + day
- hour = m.group(4)
- if len(hour) == 1:
- hour = '0' + hour
- except:
- return
+ if not m or m.group(2) not in _hungarian_months:
+ return None
+ month = _hungarian_months[m.group(2)]
+ day = m.group(3)
+ if len(day) == 1:
+ day = '0' + day
+ hour = m.group(4)
+ if len(hour) == 1:
+ hour = '0' + hour
w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s%(zonediff)s' % \
{'year': m.group(1), 'month': month, 'day': day,\
'hour': hour, 'minute': m.group(5),\
'zonediff': m.group(6)}
- if _debug: sys.stderr.write('Hungarian date parsed as: %s\n' % w3dtfdate)
return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_hungarian)
@@ -2104,6 +3345,9 @@ registerDateHandler(_parse_date_hungarian)
# Drake and licensed under the Python license. Removed all range checking
# for month, day, hour, minute, and second, since mktime will normalize
# these later
+# Modified to also support MSSQL-style datetimes as defined at:
+# http://msdn.microsoft.com/en-us/library/ms186724.aspx
+# (which basically means allowing a space as a date/time/timezone separator)
def _parse_date_w3dtf(dateString):
def __extract_date(m):
year = int(m.group('year'))
@@ -2129,7 +3373,7 @@ def _parse_date_w3dtf(dateString):
day = 31
elif jday < julian:
if day + diff < 28:
- day = day + diff
+ day = day + diff
else:
month = month + 1
return year, month, day
@@ -2183,414 +3427,558 @@ def _parse_date_w3dtf(dateString):
__date_re = ('(?P<year>\d\d\d\d)'
'(?:(?P<dsep>-|)'
- '(?:(?P<julian>\d\d\d)'
- '|(?P<month>\d\d)(?:(?P=dsep)(?P<day>\d\d))?))?')
- __tzd_re = '(?P<tzd>[-+](?P<tzdhours>\d\d)(?::?(?P<tzdminutes>\d\d))|Z)'
- __tzd_rx = re.compile(__tzd_re)
+ '(?:(?P<month>\d\d)(?:(?P=dsep)(?P<day>\d\d))?'
+ '|(?P<julian>\d\d\d)))?')
+ __tzd_re = ' ?(?P<tzd>[-+](?P<tzdhours>\d\d)(?::?(?P<tzdminutes>\d\d))|Z)?'
__time_re = ('(?P<hours>\d\d)(?P<tsep>:|)(?P<minutes>\d\d)'
- '(?:(?P=tsep)(?P<seconds>\d\d(?:[.,]\d+)?))?'
+ '(?:(?P=tsep)(?P<seconds>\d\d)(?:[.,]\d+)?)?'
+ __tzd_re)
- __datetime_re = '%s(?:T%s)?' % (__date_re, __time_re)
+ __datetime_re = '%s(?:[T ]%s)?' % (__date_re, __time_re)
__datetime_rx = re.compile(__datetime_re)
m = __datetime_rx.match(dateString)
- if (m is None) or (m.group() != dateString): return
+ if (m is None) or (m.group() != dateString):
+ return
gmt = __extract_date(m) + __extract_time(m) + (0, 0, 0)
- if gmt[0] == 0: return
+ if gmt[0] == 0:
+ return
return time.gmtime(time.mktime(gmt) + __extract_tzd(m) - time.timezone)
registerDateHandler(_parse_date_w3dtf)
-def _parse_date_rfc822(dateString):
- '''Parse an RFC822, RFC1123, RFC2822, or asctime-style date'''
- data = dateString.split()
- if data[0][-1] in (',', '.') or data[0].lower() in rfc822._daynames:
- del data[0]
- if len(data) == 4:
- s = data[3]
- i = s.find('+')
- if i > 0:
- data[3:] = [s[:i], s[i+1:]]
- else:
- data.append('')
- dateString = " ".join(data)
- if len(data) < 5:
- dateString += ' 00:00:00 GMT'
+# Define the strings used by the RFC822 datetime parser
+_rfc822_months = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
+ 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
+_rfc822_daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']
+
+# Only the first three letters of the month name matter
+_rfc822_month = "(?P<month>%s)(?:[a-z]*,?)" % ('|'.join(_rfc822_months))
+# The year may be 2 or 4 digits; capture the century if it exists
+_rfc822_year = "(?P<year>(?:\d{2})?\d{2})"
+_rfc822_day = "(?P<day> *\d{1,2})"
+_rfc822_date = "%s %s %s" % (_rfc822_day, _rfc822_month, _rfc822_year)
+
+_rfc822_hour = "(?P<hour>\d{2}):(?P<minute>\d{2})(?::(?P<second>\d{2}))?"
+_rfc822_tz = "(?P<tz>ut|gmt(?:[+-]\d{2}:\d{2})?|[aecmp][sd]?t|[zamny]|[+-]\d{4})"
+_rfc822_tznames = {
+ 'ut': 0, 'gmt': 0, 'z': 0,
+ 'adt': -3, 'ast': -4, 'at': -4,
+ 'edt': -4, 'est': -5, 'et': -5,
+ 'cdt': -5, 'cst': -6, 'ct': -6,
+ 'mdt': -6, 'mst': -7, 'mt': -7,
+ 'pdt': -7, 'pst': -8, 'pt': -8,
+ 'a': -1, 'n': 1,
+ 'm': -12, 'y': 12,
+ }
+# The timezone may be prefixed by 'Etc/'
+_rfc822_time = "%s (?:etc/)?%s" % (_rfc822_hour, _rfc822_tz)
+
+_rfc822_dayname = "(?P<dayname>%s)" % ('|'.join(_rfc822_daynames))
+_rfc822_match = re.compile(
+ "(?:%s, )?%s(?: %s)?" % (_rfc822_dayname, _rfc822_date, _rfc822_time)
+).match
+
+def _parse_date_group_rfc822(m):
+ # Calculate a date and timestamp
+ for k in ('year', 'day', 'hour', 'minute', 'second'):
+ m[k] = int(m[k])
+ m['month'] = _rfc822_months.index(m['month']) + 1
+ # If the year is 2 digits, assume everything in the 90's is the 1990's
+ if m['year'] < 100:
+ m['year'] += (1900, 2000)[m['year'] < 90]
+ stamp = datetime.datetime(*[m[i] for i in
+ ('year', 'month', 'day', 'hour', 'minute', 'second')])
+
+ # Use the timezone information to calculate the difference between
+ # the given date and timestamp and Universal Coordinated Time
+ tzhour = 0
+ tzmin = 0
+ if m['tz'] and m['tz'].startswith('gmt'):
+ # Handle GMT and GMT+hh:mm timezone syntax (the trailing
+ # timezone info will be handled by the next `if` block)
+ m['tz'] = ''.join(m['tz'][3:].split(':')) or 'gmt'
+ if not m['tz']:
+ pass
+ elif m['tz'].startswith('+'):
+ tzhour = int(m['tz'][1:3])
+ tzmin = int(m['tz'][3:])
+ elif m['tz'].startswith('-'):
+ tzhour = int(m['tz'][1:3]) * -1
+ tzmin = int(m['tz'][3:]) * -1
+ else:
+ tzhour = _rfc822_tznames[m['tz']]
+ delta = datetime.timedelta(0, 0, 0, 0, tzmin, tzhour)
+
+ # Return the date and timestamp in UTC
+ return (stamp - delta).utctimetuple()
+
+def _parse_date_rfc822(dt):
+ """Parse RFC 822 dates and times, with one minor
+ difference: years may be 4DIGIT or 2DIGIT.
+ http://tools.ietf.org/html/rfc822#section-5"""
+ try:
+ m = _rfc822_match(dt.lower()).groupdict(0)
+ except AttributeError:
+ return None
+
+ return _parse_date_group_rfc822(m)
+registerDateHandler(_parse_date_rfc822)
+
+def _parse_date_rfc822_grubby(dt):
+ """Parse date format similar to RFC 822, but
+ the comma after the dayname is optional and
+ month/day are inverted"""
+ _rfc822_date_grubby = "%s %s %s" % (_rfc822_month, _rfc822_day, _rfc822_year)
+ _rfc822_match_grubby = re.compile(
+ "(?:%s[,]? )?%s(?: %s)?" % (_rfc822_dayname, _rfc822_date_grubby, _rfc822_time)
+ ).match
+
+ try:
+ m = _rfc822_match_grubby(dt.lower()).groupdict(0)
+ except AttributeError:
+ return None
+
+ return _parse_date_group_rfc822(m)
+registerDateHandler(_parse_date_rfc822_grubby)
+
+def _parse_date_asctime(dt):
+ """Parse asctime-style dates"""
+ dayname, month, day, remainder = dt.split(None, 3)
+ # Convert month and day into zero-padded integers
+ month = '%02i ' % (_rfc822_months.index(month.lower()) + 1)
+ day = '%02i ' % (int(day),)
+ dt = month + day + remainder
+ return time.strptime(dt, '%m %d %H:%M:%S %Y')[:-1] + (0, )
+registerDateHandler(_parse_date_asctime)
+
+def _parse_date_perforce(aDateString):
+ """parse a date in yyyy/mm/dd hh:mm:ss TTT format"""
+ # Fri, 2006/09/15 08:19:53 EDT
+ _my_date_pattern = re.compile( \
+ r'(\w{,3}), (\d{,4})/(\d{,2})/(\d{2}) (\d{,2}):(\d{2}):(\d{2}) (\w{,3})')
+
+ m = _my_date_pattern.search(aDateString)
+ if m is None:
+ return None
+ dow, year, month, day, hour, minute, second, tz = m.groups()
+ months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
+ dateString = "%s, %s %s %s %s:%s:%s %s" % (dow, day, months[int(month) - 1], year, hour, minute, second, tz)
tm = rfc822.parsedate_tz(dateString)
if tm:
return time.gmtime(rfc822.mktime_tz(tm))
-# rfc822.py defines several time zones, but we define some extra ones.
-# 'ET' is equivalent to 'EST', etc.
-_additional_timezones = {'AT': -400, 'ET': -500, 'CT': -600, 'MT': -700, 'PT': -800}
-rfc822._timezones.update(_additional_timezones)
-registerDateHandler(_parse_date_rfc822)
+registerDateHandler(_parse_date_perforce)
def _parse_date(dateString):
'''Parses a variety of date formats into a 9-tuple in GMT'''
+ if not dateString:
+ return None
for handler in _date_handlers:
try:
date9tuple = handler(dateString)
- if not date9tuple: continue
- if len(date9tuple) != 9:
- if _debug: sys.stderr.write('date handler function must return 9-tuple\n')
- raise ValueError
- map(int, date9tuple)
- return date9tuple
- except Exception, e:
- if _debug: sys.stderr.write('%s raised %s\n' % (handler.__name__, repr(e)))
- pass
+ except (KeyError, OverflowError, ValueError):
+ continue
+ if not date9tuple:
+ continue
+ if len(date9tuple) != 9:
+ continue
+ return date9tuple
return None
-def _getCharacterEncoding(http_headers, xml_data):
- '''Get the character encoding of the XML document
+# Each marker represents some of the characters of the opening XML
+# processing instruction ('<?xm') in different encodings.
+EBCDIC_MARKER = _l2bytes([0x4C, 0x6F, 0xA7, 0x94])
+UTF16BE_MARKER = _l2bytes([0x00, 0x3C, 0x00, 0x3F])
+UTF16LE_MARKER = _l2bytes([0x3C, 0x00, 0x3F, 0x00])
+UTF32BE_MARKER = _l2bytes([0x00, 0x00, 0x00, 0x3C])
+UTF32LE_MARKER = _l2bytes([0x3C, 0x00, 0x00, 0x00])
+
+ZERO_BYTES = _l2bytes([0x00, 0x00])
+
+# Match the opening XML declaration.
+# Example: <?xml version="1.0" encoding="utf-8"?>
+RE_XML_DECLARATION = re.compile('^<\?xml[^>]*?>')
+
+# Capture the value of the XML processing instruction's encoding attribute.
+# Example: <?xml version="1.0" encoding="utf-8"?>
+RE_XML_PI_ENCODING = re.compile(_s2bytes('^<\?.*encoding=[\'"](.*?)[\'"].*\?>'))
+
+def convert_to_utf8(http_headers, data):
+ '''Detect and convert the character encoding to UTF-8.
http_headers is a dictionary
- xml_data is a raw string (not Unicode)
-
- This is so much trickier than it sounds, it's not even funny.
- According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type
- is application/xml, application/*+xml,
- application/xml-external-parsed-entity, or application/xml-dtd,
- the encoding given in the charset parameter of the HTTP Content-Type
- takes precedence over the encoding given in the XML prefix within the
- document, and defaults to 'utf-8' if neither are specified. But, if
- the HTTP Content-Type is text/xml, text/*+xml, or
- text/xml-external-parsed-entity, the encoding given in the XML prefix
- within the document is ALWAYS IGNORED and only the encoding given in
- the charset parameter of the HTTP Content-Type header should be
- respected, and it defaults to 'us-ascii' if not specified.
+ data is a raw string (not Unicode)'''
- Furthermore, discussion on the atom-syntax mailing list with the
- author of RFC 3023 leads me to the conclusion that any document
- served with a Content-Type of text/* and no charset parameter
- must be treated as us-ascii. (We now do this.) And also that it
- must always be flagged as non-well-formed. (We now do this too.)
-
- If Content-Type is unspecified (input was local file or non-HTTP source)
- or unrecognized (server just got it totally wrong), then go by the
- encoding given in the XML prefix of the document and default to
- 'iso-8859-1' as per the HTTP specification (RFC 2616).
-
- Then, assuming we didn't find a character encoding in the HTTP headers
- (and the HTTP Content-type allowed us to look in the body), we need
- to sniff the first few bytes of the XML data and try to determine
- whether the encoding is ASCII-compatible. Section F of the XML
- specification shows the way here:
- http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
+ # This is so much trickier than it sounds, it's not even funny.
+ # According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type
+ # is application/xml, application/*+xml,
+ # application/xml-external-parsed-entity, or application/xml-dtd,
+ # the encoding given in the charset parameter of the HTTP Content-Type
+ # takes precedence over the encoding given in the XML prefix within the
+ # document, and defaults to 'utf-8' if neither are specified. But, if
+ # the HTTP Content-Type is text/xml, text/*+xml, or
+ # text/xml-external-parsed-entity, the encoding given in the XML prefix
+ # within the document is ALWAYS IGNORED and only the encoding given in
+ # the charset parameter of the HTTP Content-Type header should be
+ # respected, and it defaults to 'us-ascii' if not specified.
- If the sniffed encoding is not ASCII-compatible, we need to make it
- ASCII compatible so that we can sniff further into the XML declaration
- to find the encoding attribute, which will tell us the true encoding.
+ # Furthermore, discussion on the atom-syntax mailing list with the
+ # author of RFC 3023 leads me to the conclusion that any document
+ # served with a Content-Type of text/* and no charset parameter
+ # must be treated as us-ascii. (We now do this.) And also that it
+ # must always be flagged as non-well-formed. (We now do this too.)
- Of course, none of this guarantees that we will be able to parse the
- feed in the declared character encoding (assuming it was declared
- correctly, which many are not). CJKCodecs and iconv_codec help a lot;
- you should definitely install them if you can.
- http://cjkpython.i18n.org/
- '''
+ # If Content-Type is unspecified (input was local file or non-HTTP source)
+ # or unrecognized (server just got it totally wrong), then go by the
+ # encoding given in the XML prefix of the document and default to
+ # 'iso-8859-1' as per the HTTP specification (RFC 2616).
- def _parseHTTPContentType(content_type):
- '''takes HTTP Content-Type header and returns (content type, charset)
-
- If no charset is specified, returns (content type, '')
- If no content type is specified, returns ('', '')
- Both return parameters are guaranteed to be lowercase strings
- '''
- content_type = content_type or ''
- content_type, params = cgi.parse_header(content_type)
- return content_type, params.get('charset', '').replace("'", '')
-
- sniffed_xml_encoding = ''
- xml_encoding = ''
- true_encoding = ''
- http_content_type, http_encoding = _parseHTTPContentType(http_headers.get('content-type'))
- # Must sniff for non-ASCII-compatible character encodings before
- # searching for XML declaration. This heuristic is defined in
- # section F of the XML specification:
+ # Then, assuming we didn't find a character encoding in the HTTP headers
+ # (and the HTTP Content-type allowed us to look in the body), we need
+ # to sniff the first few bytes of the XML data and try to determine
+ # whether the encoding is ASCII-compatible. Section F of the XML
+ # specification shows the way here:
# http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
+
+ # If the sniffed encoding is not ASCII-compatible, we need to make it
+ # ASCII compatible so that we can sniff further into the XML declaration
+ # to find the encoding attribute, which will tell us the true encoding.
+
+ # Of course, none of this guarantees that we will be able to parse the
+ # feed in the declared character encoding (assuming it was declared
+ # correctly, which many are not). iconv_codec can help a lot;
+ # you should definitely install it if you can.
+ # http://cjkpython.i18n.org/
+
+ bom_encoding = ''
+ xml_encoding = ''
+ rfc3023_encoding = ''
+
+ # Look at the first few bytes of the document to guess what
+ # its encoding may be. We only need to decode enough of the
+ # document that we can use an ASCII-compatible regular
+ # expression to search for an XML encoding declaration.
+ # The heuristic follows the XML specification, section F:
+ # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
+ # Check for BOMs first.
+ if data[:4] == codecs.BOM_UTF32_BE:
+ bom_encoding = 'utf-32be'
+ data = data[4:]
+ elif data[:4] == codecs.BOM_UTF32_LE:
+ bom_encoding = 'utf-32le'
+ data = data[4:]
+ elif data[:2] == codecs.BOM_UTF16_BE and data[2:4] != ZERO_BYTES:
+ bom_encoding = 'utf-16be'
+ data = data[2:]
+ elif data[:2] == codecs.BOM_UTF16_LE and data[2:4] != ZERO_BYTES:
+ bom_encoding = 'utf-16le'
+ data = data[2:]
+ elif data[:3] == codecs.BOM_UTF8:
+ bom_encoding = 'utf-8'
+ data = data[3:]
+ # Check for the characters '<?xm' in several encodings.
+ elif data[:4] == EBCDIC_MARKER:
+ bom_encoding = 'cp037'
+ elif data[:4] == UTF16BE_MARKER:
+ bom_encoding = 'utf-16be'
+ elif data[:4] == UTF16LE_MARKER:
+ bom_encoding = 'utf-16le'
+ elif data[:4] == UTF32BE_MARKER:
+ bom_encoding = 'utf-32be'
+ elif data[:4] == UTF32LE_MARKER:
+ bom_encoding = 'utf-32le'
+
+ tempdata = data
try:
- if xml_data[:4] == '\x4c\x6f\xa7\x94':
- # EBCDIC
- xml_data = _ebcdic_to_ascii(xml_data)
- elif xml_data[:4] == '\x00\x3c\x00\x3f':
- # UTF-16BE
- sniffed_xml_encoding = 'utf-16be'
- xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
- elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') and (xml_data[2:4] != '\x00\x00'):
- # UTF-16BE with BOM
- sniffed_xml_encoding = 'utf-16be'
- xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
- elif xml_data[:4] == '\x3c\x00\x3f\x00':
- # UTF-16LE
- sniffed_xml_encoding = 'utf-16le'
- xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
- elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and (xml_data[2:4] != '\x00\x00'):
- # UTF-16LE with BOM
- sniffed_xml_encoding = 'utf-16le'
- xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
- elif xml_data[:4] == '\x00\x00\x00\x3c':
- # UTF-32BE
- sniffed_xml_encoding = 'utf-32be'
- xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
- elif xml_data[:4] == '\x3c\x00\x00\x00':
- # UTF-32LE
- sniffed_xml_encoding = 'utf-32le'
- xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
- elif xml_data[:4] == '\x00\x00\xfe\xff':
- # UTF-32BE with BOM
- sniffed_xml_encoding = 'utf-32be'
- xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
- elif xml_data[:4] == '\xff\xfe\x00\x00':
- # UTF-32LE with BOM
- sniffed_xml_encoding = 'utf-32le'
- xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
- elif xml_data[:3] == '\xef\xbb\xbf':
- # UTF-8 with BOM
- sniffed_xml_encoding = 'utf-8'
- xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
- else:
- # ASCII-compatible
- pass
- xml_encoding_match = re.compile('^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
- except:
+ if bom_encoding:
+ tempdata = data.decode(bom_encoding).encode('utf-8')
+ except (UnicodeDecodeError, LookupError):
+ # feedparser recognizes UTF-32 encodings that aren't
+ # available in Python 2.4 and 2.5, so it's possible to
+ # encounter a LookupError during decoding.
xml_encoding_match = None
+ else:
+ xml_encoding_match = RE_XML_PI_ENCODING.match(tempdata)
+
if xml_encoding_match:
- xml_encoding = xml_encoding_match.groups()[0].lower()
- if sniffed_xml_encoding and (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16')):
- xml_encoding = sniffed_xml_encoding
+ xml_encoding = xml_encoding_match.groups()[0].decode('utf-8').lower()
+ # Normalize the xml_encoding if necessary.
+ if bom_encoding and (xml_encoding in (
+ 'u16', 'utf-16', 'utf16', 'utf_16',
+ 'u32', 'utf-32', 'utf32', 'utf_32',
+ 'iso-10646-ucs-2', 'iso-10646-ucs-4',
+ 'csucs4', 'csunicode', 'ucs-2', 'ucs-4'
+ )):
+ xml_encoding = bom_encoding
+
+ # Find the HTTP Content-Type and, hopefully, a character
+ # encoding provided by the server. The Content-Type is used
+ # to choose the "correct" encoding among the BOM encoding,
+ # XML declaration encoding, and HTTP encoding, following the
+ # heuristic defined in RFC 3023.
+ http_content_type = http_headers.get('content-type') or ''
+ http_content_type, params = cgi.parse_header(http_content_type)
+ http_encoding = params.get('charset', '').replace("'", "")
+ if not isinstance(http_encoding, str):
+ http_encoding = http_encoding.decode('utf-8', 'ignore')
+
acceptable_content_type = 0
- application_content_types = ('application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity')
+ application_content_types = ('application/xml', 'application/xml-dtd',
+ 'application/xml-external-parsed-entity')
text_content_types = ('text/xml', 'text/xml-external-parsed-entity')
if (http_content_type in application_content_types) or \
- (http_content_type.startswith('application/') and http_content_type.endswith('+xml')):
+ (http_content_type.startswith('application/') and
+ http_content_type.endswith('+xml')):
acceptable_content_type = 1
- true_encoding = http_encoding or xml_encoding or 'utf-8'
+ rfc3023_encoding = http_encoding or xml_encoding or 'utf-8'
elif (http_content_type in text_content_types) or \
- (http_content_type.startswith('text/')) and http_content_type.endswith('+xml'):
+ (http_content_type.startswith('text/') and
+ http_content_type.endswith('+xml')):
acceptable_content_type = 1
- true_encoding = http_encoding or 'us-ascii'
+ rfc3023_encoding = http_encoding or 'us-ascii'
elif http_content_type.startswith('text/'):
- true_encoding = http_encoding or 'us-ascii'
- elif http_headers and (not http_headers.has_key('content-type')):
- true_encoding = xml_encoding or 'iso-8859-1'
+ rfc3023_encoding = http_encoding or 'us-ascii'
+ elif http_headers and 'content-type' not in http_headers:
+ rfc3023_encoding = xml_encoding or 'iso-8859-1'
else:
- true_encoding = xml_encoding or 'utf-8'
- return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type
-
-def _toUTF8(data, encoding):
- '''Changes an XML data stream on the fly to specify a new encoding
-
- data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already
- encoding is a string recognized by encodings.aliases
- '''
- if _debug: sys.stderr.write('entering _toUTF8, trying encoding %s\n' % encoding)
- # strip Byte Order Mark (if present)
- if (len(data) >= 4) and (data[:2] == '\xfe\xff') and (data[2:4] != '\x00\x00'):
- if _debug:
- sys.stderr.write('stripping BOM\n')
- if encoding != 'utf-16be':
- sys.stderr.write('trying utf-16be instead\n')
- encoding = 'utf-16be'
- data = data[2:]
- elif (len(data) >= 4) and (data[:2] == '\xff\xfe') and (data[2:4] != '\x00\x00'):
- if _debug:
- sys.stderr.write('stripping BOM\n')
- if encoding != 'utf-16le':
- sys.stderr.write('trying utf-16le instead\n')
- encoding = 'utf-16le'
- data = data[2:]
- elif data[:3] == '\xef\xbb\xbf':
- if _debug:
- sys.stderr.write('stripping BOM\n')
- if encoding != 'utf-8':
- sys.stderr.write('trying utf-8 instead\n')
- encoding = 'utf-8'
- data = data[3:]
- elif data[:4] == '\x00\x00\xfe\xff':
- if _debug:
- sys.stderr.write('stripping BOM\n')
- if encoding != 'utf-32be':
- sys.stderr.write('trying utf-32be instead\n')
- encoding = 'utf-32be'
- data = data[4:]
- elif data[:4] == '\xff\xfe\x00\x00':
- if _debug:
- sys.stderr.write('stripping BOM\n')
- if encoding != 'utf-32le':
- sys.stderr.write('trying utf-32le instead\n')
- encoding = 'utf-32le'
- data = data[4:]
- newdata = unicode(data, encoding)
- if _debug: sys.stderr.write('successfully converted %s data to unicode\n' % encoding)
- declmatch = re.compile('^<\?xml[^>]*?>')
- newdecl = '''<?xml version='1.0' encoding='utf-8'?>'''
- if declmatch.search(newdata):
- newdata = declmatch.sub(newdecl, newdata)
- else:
- newdata = newdecl + u'\n' + newdata
- return newdata.encode('utf-8')
-
-def _stripDoctype(data):
- '''Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
-
- rss_version may be 'rss091n' or None
- stripped_data is the same XML document, minus the DOCTYPE
- '''
- entity_pattern = re.compile(r'<!ENTITY([^>]*?)>', re.MULTILINE)
- data = entity_pattern.sub('', data)
- doctype_pattern = re.compile(r'<!DOCTYPE([^>]*?)>', re.MULTILINE)
- doctype_results = doctype_pattern.findall(data)
- doctype = doctype_results and doctype_results[0] or ''
- if doctype.lower().count('netscape'):
- version = 'rss091n'
- else:
- version = None
- data = doctype_pattern.sub('', data)
- return version, data
-
-def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=[]):
- '''Parse a feed from a URL, file, stream, or string'''
- result = FeedParserDict()
- result['feed'] = FeedParserDict()
- result['entries'] = []
- if _XML_AVAILABLE:
- result['bozo'] = 0
- if type(handlers) == types.InstanceType:
- handlers = [handlers]
- try:
- f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers)
- data = f.read()
- except Exception, e:
- result['bozo'] = 1
- result['bozo_exception'] = e
- data = ''
- f = None
-
- # if feed is gzip-compressed, decompress it
- if f and data and hasattr(f, 'headers'):
- if gzip and f.headers.get('content-encoding', '') == 'gzip':
- try:
- data = gzip.GzipFile(fileobj=_StringIO(data)).read()
- except Exception, e:
- # Some feeds claim to be gzipped but they're not, so
- # we get garbage. Ideally, we should re-request the
- # feed without the 'Accept-encoding: gzip' header,
- # but we don't.
- result['bozo'] = 1
- result['bozo_exception'] = e
- data = ''
- elif zlib and f.headers.get('content-encoding', '') == 'deflate':
- try:
- data = zlib.decompress(data, -zlib.MAX_WBITS)
- except Exception, e:
- result['bozo'] = 1
- result['bozo_exception'] = e
- data = ''
-
- # save HTTP headers
- if hasattr(f, 'info'):
- info = f.info()
- result['etag'] = info.getheader('ETag')
- last_modified = info.getheader('Last-Modified')
- if last_modified:
- result['modified'] = _parse_date(last_modified)
- if hasattr(f, 'url'):
- result['href'] = f.url
- result['status'] = 200
- if hasattr(f, 'status'):
- result['status'] = f.status
- if hasattr(f, 'headers'):
- result['headers'] = f.headers.dict
- if hasattr(f, 'close'):
- f.close()
+ rfc3023_encoding = xml_encoding or 'utf-8'
+ # gb18030 is a superset of gb2312, so always replace gb2312
+ # with gb18030 for greater compatibility.
+ if rfc3023_encoding.lower() == 'gb2312':
+ rfc3023_encoding = 'gb18030'
+ if xml_encoding.lower() == 'gb2312':
+ xml_encoding = 'gb18030'
# there are four encodings to keep track of:
# - http_encoding is the encoding declared in the Content-Type HTTP header
# - xml_encoding is the encoding declared in the <?xml declaration
- # - sniffed_xml_encoding is the encoding sniffed from the first 4 bytes of the XML data
- # - true_encoding is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
+ # - bom_encoding is the encoding sniffed from the first 4 bytes of the XML data
+ # - rfc3023_encoding is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
+ error = None
+
+ if http_headers and (not acceptable_content_type):
+ if 'content-type' in http_headers:
+ msg = '%s is not an XML media type' % http_headers['content-type']
+ else:
+ msg = 'no Content-type specified'
+ error = NonXMLContentType(msg)
+
+ # determine character encoding
+ known_encoding = 0
+ lazy_chardet_encoding = None
+ tried_encodings = []
+ if chardet:
+ def lazy_chardet_encoding():
+ chardet_encoding = chardet.detect(data)['encoding']
+ if not chardet_encoding:
+ chardet_encoding = ''
+ if not isinstance(chardet_encoding, str):
+ chardet_encoding = str(chardet_encoding, 'ascii', 'ignore')
+ return chardet_encoding
+ # try: HTTP encoding, declared XML encoding, encoding sniffed from BOM
+ for proposed_encoding in (rfc3023_encoding, xml_encoding, bom_encoding,
+ lazy_chardet_encoding, 'utf-8', 'windows-1252', 'iso-8859-2'):
+ if callable(proposed_encoding):
+ proposed_encoding = proposed_encoding()
+ if not proposed_encoding:
+ continue
+ if proposed_encoding in tried_encodings:
+ continue
+ tried_encodings.append(proposed_encoding)
+ try:
+ data = data.decode(proposed_encoding)
+ except (UnicodeDecodeError, LookupError):
+ pass
+ else:
+ known_encoding = 1
+ # Update the encoding in the opening XML processing instruction.
+ new_declaration = '''<?xml version='1.0' encoding='utf-8'?>'''
+ if RE_XML_DECLARATION.search(data):
+ data = RE_XML_DECLARATION.sub(new_declaration, data)
+ else:
+ data = new_declaration + '\n' + data
+ data = data.encode('utf-8')
+ break
+ # if still no luck, give up
+ if not known_encoding:
+ error = CharacterEncodingUnknown(
+ 'document encoding unknown, I tried ' +
+ '%s, %s, utf-8, windows-1252, and iso-8859-2 but nothing worked' %
+ (rfc3023_encoding, xml_encoding))
+ rfc3023_encoding = ''
+ elif proposed_encoding != rfc3023_encoding:
+ error = CharacterEncodingOverride(
+ 'document declared as %s, but parsed as %s' %
+ (rfc3023_encoding, proposed_encoding))
+ rfc3023_encoding = proposed_encoding
+
+ return data, rfc3023_encoding, error
+
+# Match XML entity declarations.
+# Example: <!ENTITY copyright "(C)">
+RE_ENTITY_PATTERN = re.compile(_s2bytes(r'^\s*<!ENTITY([^>]*?)>'), re.MULTILINE)
+
+# Match XML DOCTYPE declarations.
+# Example: <!DOCTYPE feed [ <!ENTITY copyright "(C)"> ]>
+RE_DOCTYPE_PATTERN = re.compile(_s2bytes(r'^\s*<!DOCTYPE([^>]*?)>'), re.MULTILINE)
+
+# Match safe entity declarations.
+# This will allow hexadecimal character references through,
+# as well as text, but not arbitrary nested entities.
+# Example: <!ENTITY cubed "&#179;">
+# Example: <!ENTITY copyright "(C)">
+# Forbidden: <!ENTITY explode1 "&explode2;&explode2;">
+RE_SAFE_ENTITY_PATTERN = re.compile(_s2bytes('\s+(\w+)\s+"(&#\w+;|[^&"]*)"'))
+
+def replace_doctype(data):
+ '''Strips and replaces the DOCTYPE, returns (rss_version, stripped_data)
+
+ rss_version may be 'rss091n' or None
+ stripped_data is the same XML document with a replaced DOCTYPE
+ '''
+
+ # Divide the document into two groups by finding the location
+ # of the first element that doesn't begin with '<?' or '<!'.
+ start = re.search(_s2bytes('<\w'), data)
+ start = start and start.start() or -1
+ head, data = data[:start+1], data[start+1:]
+
+ # Save and then remove all of the ENTITY declarations.
+ entity_results = RE_ENTITY_PATTERN.findall(head)
+ head = RE_ENTITY_PATTERN.sub(_s2bytes(''), head)
+
+ # Find the DOCTYPE declaration and check the feed type.
+ doctype_results = RE_DOCTYPE_PATTERN.findall(head)
+ doctype = doctype_results and doctype_results[0] or _s2bytes('')
+ if _s2bytes('netscape') in doctype.lower():
+ version = 'rss091n'
+ else:
+ version = None
+
+ # Re-insert the safe ENTITY declarations if a DOCTYPE was found.
+ replacement = _s2bytes('')
+ if len(doctype_results) == 1 and entity_results:
+ match_safe_entities = lambda e: RE_SAFE_ENTITY_PATTERN.match(e)
+ safe_entities = [e for e in entity_results if match_safe_entities(e)]
+ if safe_entities:
+ replacement = _s2bytes('<!DOCTYPE feed [\n<!ENTITY ') \
+ + _s2bytes('>\n<!ENTITY ').join(safe_entities) \
+ + _s2bytes('>\n]>')
+ data = RE_DOCTYPE_PATTERN.sub(replacement, head) + data
+
+ # Precompute the safe entities for the loose parser.
+ safe_entities = dict((k.decode('utf-8'), v.decode('utf-8'))
+ for k, v in RE_SAFE_ENTITY_PATTERN.findall(replacement))
+ return version, data, safe_entities
+
+def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=None, request_headers=None, response_headers=None):
+ '''Parse a feed from a URL, file, stream, or string.
+
+ request_headers, if given, is a dict from http header name to value to add
+ to the request; this overrides internally generated values.
+ '''
+
+ if handlers is None:
+ handlers = []
+ if request_headers is None:
+ request_headers = {}
+ if response_headers is None:
+ response_headers = {}
+
+ result = FeedParserDict()
+ result['feed'] = FeedParserDict()
+ result['entries'] = []
+ result['bozo'] = 0
+ if not isinstance(handlers, list):
+ handlers = [handlers]
+ try:
+ f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers)
+ data = f.read()
+ except Exception as e:
result['bozo'] = 1
- result['bozo_exception'] = NonXMLContentType(bozo_message)
-
- result['version'], data = _stripDoctype(data)
+ result['bozo_exception'] = e
+ data = None
+ f = None
- baseuri = http_headers.get('content-location', result.get('href'))
- baselang = http_headers.get('content-language', None)
+ if hasattr(f, 'headers'):
+ result['headers'] = dict(f.headers)
+ # overwrite existing headers using response_headers
+ if 'headers' in result:
+ result['headers'].update(response_headers)
+ elif response_headers:
+ result['headers'] = copy.deepcopy(response_headers)
- # if server sent 304, we're done
- if result.get('status', 0) == 304:
+ # lowercase all of the HTTP headers for comparisons per RFC 2616
+ if 'headers' in result:
+ http_headers = dict((k.lower(), v) for k, v in list(result['headers'].items()))
+ else:
+ http_headers = {}
+
+ # if feed is gzip-compressed, decompress it
+ if f and data and http_headers:
+ if gzip and 'gzip' in http_headers.get('content-encoding', ''):
+ try:
+ data = gzip.GzipFile(fileobj=_StringIO(data)).read()
+ except (IOError, struct.error) as e:
+ # IOError can occur if the gzip header is bad.
+ # struct.error can occur if the data is damaged.
+ result['bozo'] = 1
+ result['bozo_exception'] = e
+ if isinstance(e, struct.error):
+ # A gzip header was found but the data is corrupt.
+ # Ideally, we should re-request the feed without the
+ # 'Accept-encoding: gzip' header, but we don't.
+ data = None
+ elif zlib and 'deflate' in http_headers.get('content-encoding', ''):
+ try:
+ data = zlib.decompress(data)
+ except zlib.error as e:
+ try:
+ # The data may have no headers and no checksum.
+ data = zlib.decompress(data, -15)
+ except zlib.error as e:
+ result['bozo'] = 1
+ result['bozo_exception'] = e
+
+ # save HTTP headers
+ if http_headers:
+ if 'etag' in http_headers:
+ etag = http_headers.get('etag', '')
+ if not isinstance(etag, str):
+ etag = etag.decode('utf-8', 'ignore')
+ if etag:
+ result['etag'] = etag
+ if 'last-modified' in http_headers:
+ modified = http_headers.get('last-modified', '')
+ if modified:
+ result['modified'] = modified
+ result['modified_parsed'] = _parse_date(modified)
+ if hasattr(f, 'url'):
+ if not isinstance(f.url, str):
+ result['href'] = f.url.decode('utf-8', 'ignore')
+ else:
+ result['href'] = f.url
+ result['status'] = 200
+ if hasattr(f, 'status'):
+ result['status'] = f.status
+ if hasattr(f, 'close'):
+ f.close()
+
+ if data is None:
+ return result
+
+ # Stop processing if the server sent HTTP 304 Not Modified.
+ if getattr(f, 'code', 0) == 304:
result['version'] = ''
result['debug_message'] = 'The feed has not changed since you last checked, ' + \
'so the server sent no data. This is a feature, not a bug!'
return result
- # if there was a problem downloading, we're done
- if not data:
- return result
+ data, result['encoding'], error = convert_to_utf8(http_headers, data)
+ use_strict_parser = result['encoding'] and True or False
+ if error is not None:
+ result['bozo'] = 1
+ result['bozo_exception'] = error
- # determine character encoding
- use_strict_parser = 0
- known_encoding = 0
- tried_encodings = []
- # try: HTTP encoding, declared XML encoding, encoding sniffed from BOM
- for proposed_encoding in (result['encoding'], xml_encoding, sniffed_xml_encoding):
- if not proposed_encoding: continue
- if proposed_encoding in tried_encodings: continue
- tried_encodings.append(proposed_encoding)
- try:
- data = _toUTF8(data, proposed_encoding)
- known_encoding = use_strict_parser = 1
- break
- except:
- pass
- # if no luck and we have auto-detection library, try that
- if (not known_encoding) and chardet:
- try:
- proposed_encoding = chardet.detect(data)['encoding']
- if proposed_encoding and (proposed_encoding not in tried_encodings):
- tried_encodings.append(proposed_encoding)
- data = _toUTF8(data, proposed_encoding)
- known_encoding = use_strict_parser = 1
- except:
- pass
- # if still no luck and we haven't tried utf-8 yet, try that
- if (not known_encoding) and ('utf-8' not in tried_encodings):
- try:
- proposed_encoding = 'utf-8'
- tried_encodings.append(proposed_encoding)
- data = _toUTF8(data, proposed_encoding)
- known_encoding = use_strict_parser = 1
- except:
- pass
- # if still no luck and we haven't tried windows-1252 yet, try that
- if (not known_encoding) and ('windows-1252' not in tried_encodings):
- try:
- proposed_encoding = 'windows-1252'
- tried_encodings.append(proposed_encoding)
- data = _toUTF8(data, proposed_encoding)
- known_encoding = use_strict_parser = 1
- except:
- pass
- # if still no luck, give up
- if not known_encoding:
- result['bozo'] = 1
- result['bozo_exception'] = CharacterEncodingUnknown( \
- 'document encoding unknown, I tried ' + \
- '%s, %s, utf-8, and windows-1252 but nothing worked' % \
- (result['encoding'], xml_encoding))
- result['encoding'] = ''
- elif proposed_encoding != result['encoding']:
- result['bozo'] = 1
- result['bozo_exception'] = CharacterEncodingOverride( \
- 'documented declared as %s, but parsed as %s' % \
- (result['encoding'], proposed_encoding))
- result['encoding'] = proposed_encoding
+ result['version'], data, entities = replace_doctype(data)
+
+ # Ensure that baseuri is an absolute URI using an acceptable URI scheme.
+ contentloc = http_headers.get('content-location', '')
+ href = result.get('href', '')
+ baseuri = _makeSafeAbsoluteURI(href, contentloc) or _makeSafeAbsoluteURI(contentloc) or href
+
+ baselang = http_headers.get('content-language', None)
+ if not isinstance(baselang, str) and baselang is not None:
+ baselang = baselang.decode('utf-8', 'ignore')
if not _XML_AVAILABLE:
use_strict_parser = 0
@@ -2599,260 +3987,26 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer
feedparser = _StrictFeedParser(baseuri, baselang, 'utf-8')
saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS)
saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
+ try:
+ # disable downloading external doctype references, if possible
+ saxparser.setFeature(xml.sax.handler.feature_external_ges, 0)
+ except xml.sax.SAXNotSupportedException:
+ pass
saxparser.setContentHandler(feedparser)
saxparser.setErrorHandler(feedparser)
source = xml.sax.xmlreader.InputSource()
source.setByteStream(_StringIO(data))
- if hasattr(saxparser, '_ns_stack'):
- # work around bug in built-in SAX parser (doesn't recognize xml: namespace)
- # PyXML doesn't have this problem, and it doesn't have _ns_stack either
- saxparser._ns_stack.append({'http://www.w3.org/XML/1998/namespace':'xml'})
try:
saxparser.parse(source)
- except Exception, e:
- if _debug:
- import traceback
- traceback.print_stack()
- traceback.print_exc()
- sys.stderr.write('xml parsing failed\n')
+ except xml.sax.SAXException as e:
result['bozo'] = 1
result['bozo_exception'] = feedparser.exc or e
use_strict_parser = 0
- if not use_strict_parser:
- feedparser = _LooseFeedParser(baseuri, baselang, known_encoding and 'utf-8' or '')
- feedparser.feed(data)
+ if not use_strict_parser and _SGML_AVAILABLE:
+ feedparser = _LooseFeedParser(baseuri, baselang, 'utf-8', entities)
+ feedparser.feed(data.decode('utf-8', 'replace'))
result['feed'] = feedparser.feeddata
result['entries'] = feedparser.entries
result['version'] = result['version'] or feedparser.version
result['namespaces'] = feedparser.namespacesInUse
return result
-
-if __name__ == '__main__':
- if not sys.argv[1:]:
- print __doc__
- sys.exit(0)
- else:
- urls = sys.argv[1:]
- zopeCompatibilityHack()
- from pprint import pprint
- for url in urls:
- print url
- print
- result = parse(url)
- pprint(result)
- print
-
-#REVISION HISTORY
-#1.0 - 9/27/2002 - MAP - fixed namespace processing on prefixed RSS 2.0 elements,
-# added Simon Fell's test suite
-#1.1 - 9/29/2002 - MAP - fixed infinite loop on incomplete CDATA sections
-#2.0 - 10/19/2002
-# JD - use inchannel to watch out for image and textinput elements which can
-# also contain title, link, and description elements
-# JD - check for isPermaLink='false' attribute on guid elements
-# JD - replaced openAnything with open_resource supporting ETag and
-# If-Modified-Since request headers
-# JD - parse now accepts etag, modified, agent, and referrer optional
-# arguments
-# JD - modified parse to return a dictionary instead of a tuple so that any
-# etag or modified information can be returned and cached by the caller
-#2.0.1 - 10/21/2002 - MAP - changed parse() so that if we don't get anything
-# because of etag/modified, return the old etag/modified to the caller to
-# indicate why nothing is being returned
-#2.0.2 - 10/21/2002 - JB - added the inchannel to the if statement, otherwise its
-# useless. Fixes the problem JD was addressing by adding it.
-#2.1 - 11/14/2002 - MAP - added gzip support
-#2.2 - 1/27/2003 - MAP - added attribute support, admin:generatorAgent.
-# start_admingeneratoragent is an example of how to handle elements with
-# only attributes, no content.
-#2.3 - 6/11/2003 - MAP - added USER_AGENT for default (if caller doesn't specify);
-# also, make sure we send the User-Agent even if urllib2 isn't available.
-# Match any variation of backend.userland.com/rss namespace.
-#2.3.1 - 6/12/2003 - MAP - if item has both link and guid, return both as-is.
-#2.4 - 7/9/2003 - MAP - added preliminary Pie/Atom/Echo support based on Sam Ruby's
-# snapshot of July 1 ; changed
-# project name
-#2.5 - 7/25/2003 - MAP - changed to Python license (all contributors agree);
-# removed unnecessary urllib code -- urllib2 should always be available anyway;
-# return actual url, status, and full HTTP headers (as result['url'],
-# result['status'], and result['headers']) if parsing a remote feed over HTTP --
-# this should pass all the HTTP tests at ;
-# added the latest namespace-of-the-week for RSS 2.0
-#2.5.1 - 7/26/2003 - RMK - clear opener.addheaders so we only send our custom
-# User-Agent (otherwise urllib2 sends two, which confuses some servers)
-#2.5.2 - 7/28/2003 - MAP - entity-decode inline xml properly; added support for
-# inline and as used in some RSS 2.0 feeds
-#2.5.3 - 8/6/2003 - TvdV - patch to track whether we're inside an image or
-# textInput, and also to return the character encoding (if specified)
-#2.6 - 1/1/2004 - MAP - dc:author support (MarekK); fixed bug tracking
-# nested divs within content (JohnD); fixed missing sys import (JohanS);
-# fixed regular expression to capture XML character encoding (Andrei);
-# added support for Atom 0.3-style links; fixed bug with textInput tracking;
-# added support for cloud (MartijnP); added support for multiple
-# category/dc:subject (MartijnP); normalize content model: 'description' gets
-# description (which can come from description, summary, or full content if no
-# description), 'content' gets dict of base/language/type/value (which can come
-# from content:encoded, xhtml:body, content, or fullitem);
-# fixed bug matching arbitrary Userland namespaces; added xml:base and xml:lang
-# tracking; fixed bug tracking unknown tags; fixed bug tracking content when
-# element is not in default namespace (like Pocketsoap feed);
-# resolve relative URLs in link, guid, docs, url, comments, wfw:comment,
-# wfw:commentRSS; resolve relative URLs within embedded HTML markup in
-# description, xhtml:body, content, content:encoded, title, subtitle,
-# summary, info, tagline, and copyright; added support for pingback and
-# trackback namespaces
-#2.7 - 1/5/2004 - MAP - really added support for trackback and pingback
-# namespaces, as opposed to 2.6 when I said I did but didn't really;
-# sanitize HTML markup within some elements; added mxTidy support (if
-# installed) to tidy HTML markup within some elements; fixed indentation
-# bug in _parse_date (FazalM); use socket.setdefaulttimeout if available
-# (FazalM); universal date parsing and normalization (FazalM): 'created', modified',
-# 'issued' are parsed into 9-tuple date format and stored in 'created_parsed',
-# 'modified_parsed', and 'issued_parsed'; 'date' is duplicated in 'modified'
-# and vice-versa; 'date_parsed' is duplicated in 'modified_parsed' and vice-versa
-#2.7.1 - 1/9/2004 - MAP - fixed bug handling " and '. fixed memory
-# leak not closing url opener (JohnD); added dc:publisher support (MarekK);
-# added admin:errorReportsTo support (MarekK); Python 2.1 dict support (MarekK)
-#2.7.4 - 1/14/2004 - MAP - added workaround for improperly formed
tags in
-# encoded HTML (skadz); fixed unicode handling in normalize_attrs (ChrisL);
-# fixed relative URI processing for guid (skadz); added ICBM support; added
-# base64 support
-#2.7.5 - 1/15/2004 - MAP - added workaround for malformed DOCTYPE (seen on many
-# blogspot.com sites); added _debug variable
-#2.7.6 - 1/16/2004 - MAP - fixed bug with StringIO importing
-#3.0b3 - 1/23/2004 - MAP - parse entire feed with real XML parser (if available);
-# added several new supported namespaces; fixed bug tracking naked markup in
-# description; added support for enclosure; added support for source; re-added
-# support for cloud which got dropped somehow; added support for expirationDate
-#3.0b4 - 1/26/2004 - MAP - fixed xml:lang inheritance; fixed multiple bugs tracking
-# xml:base URI, one for documents that don't define one explicitly and one for
-# documents that define an outer and an inner xml:base that goes out of scope
-# before the end of the document
-#3.0b5 - 1/26/2004 - MAP - fixed bug parsing multiple links at feed level
-#3.0b6 - 1/27/2004 - MAP - added feed type and version detection, result['version']
-# will be one of SUPPORTED_VERSIONS.keys() or empty string if unrecognized;
-# added support for creativeCommons:license and cc:license; added support for
-# full Atom content model in title, tagline, info, copyright, summary; fixed bug
-# with gzip encoding (not always telling server we support it when we do)
-#3.0b7 - 1/28/2004 - MAP - support Atom-style author element in author_detail
-# (dictionary of 'name', 'url', 'email'); map author to author_detail if author
-# contains name + email address
-#3.0b8 - 1/28/2004 - MAP - added support for contributor
-#3.0b9 - 1/29/2004 - MAP - fixed check for presence of dict function; added
-# support for summary
-#3.0b10 - 1/31/2004 - MAP - incorporated ISO-8601 date parsing routines from
-# xml.util.iso8601
-#3.0b11 - 2/2/2004 - MAP - added 'rights' to list of elements that can contain
-# dangerous markup; fiddled with decodeEntities (not right); liberalized
-# date parsing even further
-#3.0b12 - 2/6/2004 - MAP - fiddled with decodeEntities (still not right);
-# added support to Atom 0.2 subtitle; added support for Atom content model
-# in copyright; better sanitizing of dangerous HTML elements with end tags
-# (script, frameset)
-#3.0b13 - 2/8/2004 - MAP - better handling of empty HTML tags (br, hr, img,
-# etc.) in embedded markup, in either HTML or XHTML form (
,
,
)
-#3.0b14 - 2/8/2004 - MAP - fixed CDATA handling in non-wellformed feeds under
-# Python 2.1
-#3.0b15 - 2/11/2004 - MAP - fixed bug resolving relative links in wfw:commentRSS;
-# fixed bug capturing author and contributor URL; fixed bug resolving relative
-# links in author and contributor URL; fixed bug resolvin relative links in
-# generator URL; added support for recognizing RSS 1.0; passed Simon Fell's
-# namespace tests, and included them permanently in the test suite with his
-# permission; fixed namespace handling under Python 2.1
-#3.0b16 - 2/12/2004 - MAP - fixed support for RSS 0.90 (broken in b15)
-#3.0b17 - 2/13/2004 - MAP - determine character encoding as per RFC 3023
-#3.0b18 - 2/17/2004 - MAP - always map description to summary_detail (Andrei);
-# use libxml2 (if available)
-#3.0b19 - 3/15/2004 - MAP - fixed bug exploding author information when author
-# name was in parentheses; removed ultra-problematic mxTidy support; patch to
-# workaround crash in PyXML/expat when encountering invalid entities
-# (MarkMoraes); support for textinput/textInput
-#3.0b20 - 4/7/2004 - MAP - added CDF support
-#3.0b21 - 4/14/2004 - MAP - added Hot RSS support
-#3.0b22 - 4/19/2004 - MAP - changed 'channel' to 'feed', 'item' to 'entries' in
-# results dict; changed results dict to allow getting values with results.key
-# as well as results[key]; work around embedded illformed HTML with half
-# a DOCTYPE; work around malformed Content-Type header; if character encoding
-# is wrong, try several common ones before falling back to regexes (if this
-# works, bozo_exception is set to CharacterEncodingOverride); fixed character
-# encoding issues in BaseHTMLProcessor by tracking encoding and converting
-# from Unicode to raw strings before feeding data to sgmllib.SGMLParser;
-# convert each value in results to Unicode (if possible), even if using
-# regex-based parsing
-#3.0b23 - 4/21/2004 - MAP - fixed UnicodeDecodeError for feeds that contain
-# high-bit characters in attributes in embedded HTML in description (thanks
-# Thijs van de Vossen); moved guid, date, and date_parsed to mapped keys in
-# FeedParserDict; tweaked FeedParserDict.has_key to return True if asking
-# about a mapped key
-#3.0fc1 - 4/23/2004 - MAP - made results.entries[0].links[0] and
-# results.entries[0].enclosures[0] into FeedParserDict; fixed typo that could
-# cause the same encoding to be tried twice (even if it failed the first time);
-# fixed DOCTYPE stripping when DOCTYPE contained entity declarations;
-# better textinput and image tracking in illformed RSS 1.0 feeds
-#3.0fc2 - 5/10/2004 - MAP - added and passed Sam's amp tests; added and passed
-# my blink tag tests
-#3.0fc3 - 6/18/2004 - MAP - fixed bug in _changeEncodingDeclaration that
-# failed to parse utf-16 encoded feeds; made source into a FeedParserDict;
-# duplicate admin:generatorAgent/@rdf:resource in generator_detail.url;
-# added support for image; refactored parse() fallback logic to try other
-# encodings if SAX parsing fails (previously it would only try other encodings
-# if re-encoding failed); remove unichr madness in normalize_attrs now that
-# we're properly tracking encoding in and out of BaseHTMLProcessor; set
-# feed.language from root-level xml:lang; set entry.id from rdf:about;
-# send Accept header
-#3.0 - 6/21/2004 - MAP - don't try iso-8859-1 (can't distinguish between
-# iso-8859-1 and windows-1252 anyway, and most incorrectly marked feeds are
-# windows-1252); fixed regression that could cause the same encoding to be
-# tried twice (even if it failed the first time)
-#3.0.1 - 6/22/2004 - MAP - default to us-ascii for all text/* content types;
-# recover from malformed content-type header parameter with no equals sign
-# ('text/xml; charset:iso-8859-1')
-#3.1 - 6/28/2004 - MAP - added and passed tests for converting HTML entities
-# to Unicode equivalents in illformed feeds (aaronsw); added and
-# passed tests for converting character entities to Unicode equivalents
-# in illformed feeds (aaronsw); test for valid parsers when setting
-# XML_AVAILABLE; make version and encoding available when server returns
-# a 304; add handlers parameter to pass arbitrary urllib2 handlers (like
-# digest auth or proxy support); add code to parse username/password
-# out of url and send as basic authentication; expose downloading-related
-# exceptions in bozo_exception (aaronsw); added __contains__ method to
-# FeedParserDict (aaronsw); added publisher_detail (aaronsw)
-#3.2 - 7/3/2004 - MAP - use cjkcodecs and iconv_codec if available; always
-# convert feed to UTF-8 before passing to XML parser; completely revamped
-# logic for determining character encoding and attempting XML parsing
-# (much faster); increased default timeout to 20 seconds; test for presence
-# of Location header on redirects; added tests for many alternate character
-# encodings; support various EBCDIC encodings; support UTF-16BE and
-# UTF16-LE with or without a BOM; support UTF-8 with a BOM; support
-# UTF-32BE and UTF-32LE with or without a BOM; fixed crashing bug if no
-# XML parsers are available; added support for 'Content-encoding: deflate';
-# send blank 'Accept-encoding: ' header if neither gzip nor zlib modules
-# are available
-#3.3 - 7/15/2004 - MAP - optimize EBCDIC to ASCII conversion; fix obscure
-# problem tracking xml:base and xml:lang if element declares it, child
-# doesn't, first grandchild redeclares it, and second grandchild doesn't;
-# refactored date parsing; defined public registerDateHandler so callers
-# can add support for additional date formats at runtime; added support
-# for OnBlog, Nate, MSSQL, Greek, and Hungarian dates (ytrewq1); added
-# zopeCompatibilityHack() which turns FeedParserDict into a regular
-# dictionary, required for Zope compatibility, and also makes command-
-# line debugging easier because pprint module formats real dictionaries
-# better than dictionary-like objects; added NonXMLContentType exception,
-# which is stored in bozo_exception when a feed is served with a non-XML
-# media type such as 'text/plain'; respect Content-Language as default
-# language if not xml:lang is present; cloud dict is now FeedParserDict;
-# generator dict is now FeedParserDict; better tracking of xml:lang,
-# including support for xml:lang='' to unset the current language;
-# recognize RSS 1.0 feeds even when RSS 1.0 namespace is not the default
-# namespace; don't overwrite final status on redirects (scenarios:
-# redirecting to a URL that returns 304, redirecting to a URL that
-# redirects to another URL with a different type of redirect); add
-# support for HTTP 303 redirects
-#4.0 - MAP - support for relative URIs in xml:base attribute; fixed
-# encoding issue with mxTidy (phopkins); preliminary support for RFC 3229;
-# support for Atom 1.0; support for iTunes extensions; new 'tags' for
-# categories/keywords/etc. as array of dict
-# {'term': term, 'scheme': scheme, 'label': label} to match Atom 1.0
-# terminology; parse RFC 822-style dates with no time; lots of other
-# bug fixes
-#4.1 - MAP - removed socket timeout; added support for chardet library
diff --git a/libs/magic.py b/libs/magic.py
index 6c30a0c..10685ac 100644
--- a/libs/magic.py
+++ b/libs/magic.py
@@ -1,7 +1,4 @@
"""
-Adam Hupp (adam@hupp.org)
-http://github.com/ahupp/python-magic
-
magic is a wrapper around the libmagic file identification library.
See README for more information.
@@ -20,9 +17,12 @@ Usage:
"""
+import sys
+import glob
import os.path
import ctypes
import ctypes.util
+import threading
from ctypes import c_char_p, c_int, c_size_t, c_void_p
@@ -34,74 +34,112 @@ class Magic:
"""
- def __init__(self, mime=False, magic_file=None, mime_encoding=False):
+ def __init__(self, mime=False, magic_file=None, mime_encoding=False,
+ keep_going=False):
"""
Create a new libmagic wrapper.
mime - if True, mimetypes are returned instead of textual descriptions
mime_encoding - if True, codec is returned
magic_file - use a mime database other than the system default
-
+ keep_going - don't stop at the first match, keep going
"""
- flags = MAGIC_NONE
+ self.flags = MAGIC_NONE
if mime:
- flags |= MAGIC_MIME
+ self.flags |= MAGIC_MIME
elif mime_encoding:
- flags |= MAGIC_MIME_ENCODING
+ self.flags |= MAGIC_MIME_ENCODING
+ if keep_going:
+ self.flags |= MAGIC_CONTINUE
- self.cookie = magic_open(flags)
+ self.cookie = magic_open(self.flags)
magic_load(self.cookie, magic_file)
+ self.thread = threading.currentThread()
def from_buffer(self, buf):
"""
Identify the contents of `buf`
"""
- return magic_buffer(self.cookie, buf)
+ self._thread_check()
+ try:
+ return magic_buffer(self.cookie, buf)
+ except MagicException as e:
+ return self._handle509Bug(e)
def from_file(self, filename):
"""
Identify the contents of file `filename`
raises IOError if the file does not exist
"""
-
+ self._thread_check()
if not os.path.exists(filename):
raise IOError("File does not exist: " + filename)
+ try:
+ return magic_file(self.cookie, filename)
+ except MagicException as e:
+ return self._handle509Bug(e)
- return magic_file(self.cookie, filename)
+ def _handle509Bug(self, e):
+    # libmagic 5.09 has a bug where it might fail to identify the
+ # mimetype of a file and returns null from magic_file (and
+ # likely _buffer), but also does not return an error message.
+ if e.message is None and (self.flags & MAGIC_MIME):
+ return "application/octet-stream"
+
+ def _thread_check(self):
+ if self.thread != threading.currentThread():
+ raise Exception('attempting to use libmagic on multiple threads will '
+ 'end in SEGV. Prefer to use the module functions '
+ 'from_file or from_buffer, or carefully manage direct '
+ 'use of the Magic class')
def __del__(self):
- if self.cookie:
+ # no _thread_check here because there can be no other
+ # references to this object at this point.
+
+ # during shutdown magic_close may have been cleared already so
+ # make sure it exists before using it.
+
+        # the self.cookie check should be unnecessary and was an
+ # incorrect fix for a threading problem, however I'm leaving
+ # it in because it's harmless and I'm slightly afraid to
+ # remove it.
+ if self.cookie and magic_close:
magic_close(self.cookie)
self.cookie = None
-_magic_mime = None
-_magic = None
-def _get_magic_mime():
- global _magic_mime
- if not _magic_mime:
- _magic_mime = Magic(mime=True)
- return _magic_mime
-
-def _get_magic():
- global _magic
- if not _magic:
- _magic = Magic()
- return _magic
+instances = threading.local()
def _get_magic_type(mime):
- if mime:
- return _get_magic_mime()
- else:
- return _get_magic()
+ i = instances.__dict__.get(mime)
+ if i is None:
+ i = instances.__dict__[mime] = Magic(mime=mime)
+ return i
def from_file(filename, mime=False):
+ """"
+ Accepts a filename and returns the detected filetype. Return
+ value is the mimetype if mime=True, otherwise a human readable
+ name.
+
+ >>> magic.from_file("testdata/test.pdf", mime=True)
+ 'application/pdf'
+ """
m = _get_magic_type(mime)
return m.from_file(filename)
def from_buffer(buffer, mime=False):
+ """
+ Accepts a binary string and returns the detected filetype. Return
+ value is the mimetype if mime=True, otherwise a human readable
+ name.
+
+ >>> magic.from_buffer(open("testdata/test.pdf").read(1024))
+ 'PDF document, version 1.2'
+ """
m = _get_magic_type(mime)
return m.from_buffer(buffer)
@@ -110,19 +148,22 @@ def from_buffer(buffer, mime=False):
libmagic = None
# Let's try to find magic or magic1
-dll = ctypes.util.find_library('magic') or ctypes.util.find_library('magic1')
+dll = ctypes.util.find_library('magic') or ctypes.util.find_library('magic1') or ctypes.util.find_library('cygmagic-1')
# This is necessary because find_library returns None if it doesn't find the library
if dll:
libmagic = ctypes.CDLL(dll)
if not libmagic or not libmagic._name:
- import sys
- platform_to_lib = {'darwin': '/opt/local/lib/libmagic.dylib',
- 'win32': 'magic1.dll'}
- if sys.platform in platform_to_lib:
+ platform_to_lib = {'darwin': ['/opt/local/lib/libmagic.dylib',
+ '/usr/local/lib/libmagic.dylib'] +
+ # Assumes there will only be one version installed
+ glob.glob('/usr/local/Cellar/libmagic/*/lib/libmagic.dylib'),
+ 'win32': ['magic1.dll','cygmagic-1.dll']}
+ for dll in platform_to_lib.get(sys.platform, []):
try:
- libmagic = ctypes.CDLL(platform_to_lib[sys.platform])
+ libmagic = ctypes.CDLL(dll)
+ break
except OSError:
pass
@@ -132,13 +173,38 @@ if not libmagic or not libmagic._name:
magic_t = ctypes.c_void_p
-def errorcheck(result, func, args):
- err = magic_error(args[0])
- if err is not None:
+def errorcheck_null(result, func, args):
+ if result is None:
+ err = magic_error(args[0])
raise MagicException(err)
else:
return result
+def errorcheck_negative_one(result, func, args):
+ if result is -1:
+ err = magic_error(args[0])
+ raise MagicException(err)
+ else:
+ return result
+
+
+def coerce_filename(filename):
+ if filename is None:
+ return None
+
+ # ctypes will implicitly convert unicode strings to bytes with
+ # .encode('ascii'). If you use the filesystem encoding
+ # then you'll get inconsistent behavior (crashes) depending on the user's
+ # LANG environment variable
+ is_unicode = (sys.version_info[0] <= 2 and
+ isinstance(filename, unicode)) or \
+ (sys.version_info[0] >= 3 and
+ isinstance(filename, str))
+ if is_unicode:
+ return filename.encode('utf-8')
+ else:
+ return filename
+
magic_open = libmagic.magic_open
magic_open.restype = magic_t
magic_open.argtypes = [c_int]
@@ -155,26 +221,30 @@ magic_errno = libmagic.magic_errno
magic_errno.restype = c_int
magic_errno.argtypes = [magic_t]
-magic_file = libmagic.magic_file
-magic_file.restype = c_char_p
-magic_file.argtypes = [magic_t, c_char_p]
-magic_file.errcheck = errorcheck
+_magic_file = libmagic.magic_file
+_magic_file.restype = c_char_p
+_magic_file.argtypes = [magic_t, c_char_p]
+_magic_file.errcheck = errorcheck_null
+def magic_file(cookie, filename):
+ return _magic_file(cookie, coerce_filename(filename))
_magic_buffer = libmagic.magic_buffer
_magic_buffer.restype = c_char_p
_magic_buffer.argtypes = [magic_t, c_void_p, c_size_t]
-_magic_buffer.errcheck = errorcheck
-
+_magic_buffer.errcheck = errorcheck_null
def magic_buffer(cookie, buf):
return _magic_buffer(cookie, buf, len(buf))
-magic_load = libmagic.magic_load
-magic_load.restype = c_int
-magic_load.argtypes = [magic_t, c_char_p]
-magic_load.errcheck = errorcheck
+_magic_load = libmagic.magic_load
+_magic_load.restype = c_int
+_magic_load.argtypes = [magic_t, c_char_p]
+_magic_load.errcheck = errorcheck_negative_one
+
+def magic_load(cookie, filename):
+ return _magic_load(cookie, coerce_filename(filename))
magic_setflags = libmagic.magic_setflags
magic_setflags.restype = c_int
diff --git a/libs/pytwmn.py b/libs/pytwmn.py
index 49661fb..6b2d774 100644
--- a/libs/pytwmn.py
+++ b/libs/pytwmn.py
@@ -45,8 +45,8 @@ def init(host="127.0.0.1", port=None):
class Notification(object):
def __init__(self, title="", msg="", icon=""):
- self.title = unicode(title)
- self.msg = unicode(msg)
+ self.title = str(title)
+ self.msg = str(msg)
if icon.startswith("file://"):
icon = icon[7:]
self.icon = icon
diff --git a/libs/sgmllib.py b/libs/sgmllib.py
new file mode 100644
index 0000000..88a02a3
--- /dev/null
+++ b/libs/sgmllib.py
@@ -0,0 +1,547 @@
+"""A parser for SGML, using the derived class as a static DTD."""
+
+# XXX This only supports those SGML features used by HTML.
+
+# XXX There should be a way to distinguish between PCDATA (parsed
+# character data -- the normal case), RCDATA (replaceable character
+# data -- only char and entity references and end tags are special)
+# and CDATA (character data -- only end tags are special). RCDATA is
+# not supported at all.
+
+import _markupbase
+import re
+
+__all__ = ["SGMLParser", "SGMLParseError"]
+
+# Regular expressions used for parsing
+
+interesting = re.compile('[&<]')
+incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
+ '<([a-zA-Z][^<>]*|'
+ '/([a-zA-Z][^<>]*)?|'
+ '![^<>]*)?')
+
+entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
+charref = re.compile('([0-9]+)[^0-9]')
+
+starttagopen = re.compile('<[>a-zA-Z]')
+shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
+shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
+piclose = re.compile('>')
+endbracket = re.compile('[<>]')
+tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
+attrfind = re.compile(
+ r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
+ r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
+
+
+class SGMLParseError(RuntimeError):
+ """Exception raised for all parse errors."""
+ pass
+
+
+# SGML parser base class -- find tags and call handler functions.
+# Usage: p = SGMLParser(); p.feed(data); ...; p.close().
+# The dtd is defined by deriving a class which defines methods
+# with special names to handle tags: start_foo and end_foo to handle
+# <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
+# (Tags are converted to lower case for this purpose.) The data
+# between tags is passed to the parser by calling self.handle_data()
+# with some data as argument (the data may be split up in arbitrary
+# chunks). Entity references are passed by calling
+# self.handle_entityref() with the entity reference as argument.
+
+class SGMLParser(_markupbase.ParserBase):
+ # Definition of entities -- derived classes may override
+ entity_or_charref = re.compile('&(?:'
+ '([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)'
+ ')(;?)')
+
+ def __init__(self, verbose=0):
+ """Initialize and reset this instance."""
+ self.verbose = verbose
+ self.reset()
+
+ def reset(self):
+ """Reset this instance. Loses all unprocessed data."""
+ self.__starttag_text = None
+ self.rawdata = ''
+ self.stack = []
+ self.lasttag = '???'
+ self.nomoretags = 0
+ self.literal = 0
+ _markupbase.ParserBase.reset(self)
+
+ def setnomoretags(self):
+ """Enter literal mode (CDATA) till EOF.
+
+ Intended for derived classes only.
+ """
+ self.nomoretags = self.literal = 1
+
+ def setliteral(self, *args):
+ """Enter literal mode (CDATA).
+
+ Intended for derived classes only.
+ """
+ self.literal = 1
+
+ def feed(self, data):
+ """Feed some data to the parser.
+
+ Call this as often as you want, with as little or as much text
+ as you want (may include '\n'). (This just saves the text,
+ all the processing is done by goahead().)
+ """
+
+ self.rawdata = self.rawdata + data
+ self.goahead(0)
+
+ def close(self):
+ """Handle the remaining data."""
+ self.goahead(1)
+
+ def error(self, message):
+ raise SGMLParseError(message)
+
+ # Internal -- handle data as far as reasonable. May leave state
+ # and data to be processed by a subsequent call. If 'end' is
+ # true, force handling all data as if followed by EOF marker.
+ def goahead(self, end):
+ rawdata = self.rawdata
+ i = 0
+ n = len(rawdata)
+ while i < n:
+ if self.nomoretags:
+ self.handle_data(rawdata[i:n])
+ i = n
+ break
+ match = interesting.search(rawdata, i)
+ if match: j = match.start()
+ else: j = n
+ if i < j:
+ self.handle_data(rawdata[i:j])
+ i = j
+ if i == n: break
+ if rawdata[i] == '<':
+ if starttagopen.match(rawdata, i):
+ if self.literal:
+ self.handle_data(rawdata[i])
+ i = i+1
+ continue
+ k = self.parse_starttag(i)
+ if k < 0: break
+ i = k
+ continue
+ if rawdata.startswith("", i):
+ k = self.parse_endtag(i)
+ if k < 0: break
+ i = k
+ self.literal = 0
+ continue
+ if self.literal:
+ if n > (i + 1):
+ self.handle_data("<")
+ i = i+1
+ else:
+ # incomplete
+ break
+ continue
+ if rawdata.startswith(" send "%s"' % msg)
try:
self.socket.send(msg + bytes("\r\n", "ascii"))
- except socket.error, se:
+ except socket.error as se:
try: # a little dance of compatibility to get the errno
errno = se.errno
except AttributeError:
@@ -160,12 +161,12 @@ class IRCClient:
while not self._end:
try:
buffer += self.socket.recv(1024)
- except socket.timeout, e:
+ except socket.timeout as e:
if self._end:
break
logging.debug("timeout in client.py")
raise e
- except socket.error, e:
+ except socket.error as e:
if self._end:
break
logging.debug("error %s" % e)
@@ -195,16 +196,16 @@ class IRCClient:
pass
yield True
- except socket.timeout, se:
+ except socket.timeout as se:
logging.debug("passing timeout")
raise se
- except socket.error, se:
+ except socket.error as se:
logging.debug("problem: %s" % (se))
if self.socket:
logging.info('error: closing socket')
self.socket.close()
raise se
- except Exception, e:
+ except Exception as e:
logging.debug("other exception: %s" % e)
raise e
else:
@@ -253,7 +254,7 @@ class IRCApp:
garuntee the callback will be called after seconds has passed.
( the only advantage to these timers is they dont use threads )
"""
- assert callable(cb)
+ assert isinstance(cb, collections.Callable)
logging.info('added timer to call %s in %ss' % (cb, seconds))
self._timers.append((time.time() + seconds, cb))
@@ -264,13 +265,13 @@ class IRCApp:
while self.running:
found_one_alive = False
- for client, clientdesc in self._clients.iteritems():
+ for client, clientdesc in self._clients.items():
if clientdesc.con is None:
clientdesc.con = client.connect()
try:
- clientdesc.con.next()
- except Exception, e:
+ next(clientdesc.con)
+ except Exception as e:
logging.error('client error %s' % e)
logging.error(traceback.format_exc())
if clientdesc.autoreconnect:
diff --git a/oyoyo/cmdhandler.py b/oyoyo/cmdhandler.py
index 778020e..a7a8a86 100644
--- a/oyoyo/cmdhandler.py
+++ b/oyoyo/cmdhandler.py
@@ -65,13 +65,17 @@ class CommandHandler(object):
its possible to pass both "command.sub.func" and
["command", "sub", "func"].
"""
- if isinstance(in_command_parts, (str, bytes)):
+ if isinstance(in_command_parts, (bytes)):
in_command_parts = in_command_parts.split(bytes('.', 'ascii'))
+ elif isinstance(in_command_parts, (str)):
+ in_command_parts = in_command_parts.split('.')
command_parts = in_command_parts[:]
p = self
while command_parts:
- cmd = command_parts.pop(0).decode('ascii')
+ cmd = command_parts.pop(0)
+ if type(cmd) is bytes:
+ cmd = cmd.decode('utf-8')
if cmd.startswith('_'):
raise ProtectedCommandError(in_command_parts)
@@ -105,7 +109,7 @@ class CommandHandler(object):
try:
f(*args)
- except Exception, e:
+ except Exception as e:
logging.error('command raised %s' % e)
logging.error(traceback.format_exc())
raise CommandError(command)
@@ -151,7 +155,7 @@ class DefaultBotCommandHandler(CommandHandler):
else:
try:
f = self.get(arg)
- except CommandError, e:
+ except CommandError as e:
helpers.msg(self.client, dest, str(e))
return
@@ -198,7 +202,7 @@ class BotCommandHandler(DefaultCommandHandler):
try:
self.command_handler.run(command, prefix, dest, *arg)
- except CommandError, e:
+ except CommandError as e:
helpers.msg(self.client, dest, str(e))
return True
diff --git a/oyoyo/examplebot.py b/oyoyo/examplebot.py
index 81aac02..dfd1885 100644
--- a/oyoyo/examplebot.py
+++ b/oyoyo/examplebot.py
@@ -21,7 +21,7 @@ class MyHandler(DefaultCommandHandler):
match = re.match('\!say (.*)', msg)
if match:
to_say = match.group(1).strip()
- print('Saying, "%s"' % to_say)
+        print('Saying, "%s"' % to_say)
helpers.msg(self.client, chan, to_say)
@@ -37,7 +37,7 @@ def main():
conn = cli.connect()
while True:
- conn.next() ## python 2
+            next(conn)
# next(conn) ## python 3
diff --git a/oyoyo/helpers.py b/oyoyo/helpers.py
index c82ec9c..5c25b59 100644
--- a/oyoyo/helpers.py
+++ b/oyoyo/helpers.py
@@ -111,7 +111,7 @@ def _addNumerics():
cli.send(cmd_num, *args)
return f
m = sys.modules[__name__]
- for num, name in ircevents.numeric_events.iteritems():
+ for num, name in ircevents.numeric_events.items():
setattr(m, name, numericcmd(num, name))
_addNumerics()
diff --git a/oyoyo/ircevents.py b/oyoyo/ircevents.py
index 6d8969b..a1bda3c 100644
--- a/oyoyo/ircevents.py
+++ b/oyoyo/ircevents.py
@@ -179,6 +179,8 @@ numeric_events = {
"502": "usersdontmatch",
}
+numeric_events = {bytes(k, 'ascii'):v for k, v in numeric_events.items()}
+
generated_events = [
# Generated events
"dcc_connect",
@@ -206,5 +208,5 @@ protocol_events = [
"pong",
]
-all_events = generated_events + protocol_events + numeric_events.values()
+all_events = generated_events + protocol_events + list(numeric_events.values())
diff --git a/oyoyo/services.py b/oyoyo/services.py
index 9183beb..751a787 100644
--- a/oyoyo/services.py
+++ b/oyoyo/services.py
@@ -1,5 +1,5 @@
import sys
-from helpers import msg
+from .helpers import msg
# NickServ basic functions
_nickservfuncs = (
@@ -103,7 +103,7 @@ def _addServ(serv, funcs, prefix=""):
if prefix:
cmd_name = prefix.upper() + " " + cmd_name
def f(cli, *args):
- print cmd_name, " ".join(args)
+ print(cmd_name, " ".join(args))
#cli.send(cmd_name, serv.name, *args)
return f
for t in funcs:
diff --git a/parsetools.py b/parsetools.py
index 4abceed..c834020 100644
--- a/parsetools.py
+++ b/parsetools.py
@@ -29,7 +29,7 @@ quirkloader = ScriptQuirks()
quirkloader.add(PythonQuirks())
quirkloader.add(LuaQuirks())
quirkloader.loadAll()
-print quirkloader.funcre()
+print(quirkloader.funcre())
_functionre = re.compile(r"%s" % quirkloader.funcre())
_groupre = re.compile(r"\\([0-9]+)")
@@ -44,7 +44,7 @@ def lexer(string, objlist):
for (oType, regexp) in objlist:
newstringlist = []
for (stri, s) in enumerate(stringlist):
- if type(s) not in [str, unicode]:
+ if type(s) not in [str]:
newstringlist.append(s)
continue
lasti = 0
@@ -207,9 +207,9 @@ def lexMessage(string):
(smiley, _smilere),
(honker, _honk)]
- string = unicode(string)
+ string = str(string)
string = string.replace("\n", " ").replace("\r", " ")
- lexed = lexer(unicode(string), lexlist)
+ lexed = lexer(str(string), lexlist)
balanced = []
beginc = 0
@@ -231,7 +231,7 @@ def lexMessage(string):
balanced.append(colorEnd(""))
if len(balanced) == 0:
balanced.append("")
- if type(balanced[len(balanced)-1]) not in [str, unicode]:
+ if type(balanced[len(balanced)-1]) not in [str]:
balanced.append("")
return balanced
@@ -239,12 +239,12 @@ def convertTags(lexed, format="html"):
if format not in ["html", "bbcode", "ctag", "text"]:
raise ValueError("Color format not recognized")
- if type(lexed) in [str, unicode]:
+ if type(lexed) in [str]:
lexed = lexMessage(lexed)
escaped = ""
firststr = True
for (i, o) in enumerate(lexed):
- if type(o) in [str, unicode]:
+ if type(o) in [str]:
if format == "html":
escaped += o.replace("&", "&").replace(">", ">").replace("<","<")
else:
@@ -259,7 +259,7 @@ def splitMessage(msg, format="ctag"):
# split long text lines
buf = []
for o in msg:
- if type(o) in [str, unicode] and len(o) > 200:
+ if type(o) in [str] and len(o) > 200:
for i in range(0, len(o), 200):
buf.append(o[i:i+200])
else:
@@ -401,7 +401,7 @@ def parseRegexpFunctions(to):
backr = _groupre.search(mo.group())
if backr is not None:
current.append(backreference(backr.group(1)))
- elif mo.group()[:-1] in functiondict.keys():
+            elif mo.group()[:-1] in functiondict:
p = parseLeaf(functiondict[mo.group()[:-1]], current)
current.append(p)
current = p
@@ -418,7 +418,7 @@ def parseRegexpFunctions(to):
def img2smiley(string):
- string = unicode(string)
+ string = str(string)
def imagerep(mo):
return reverse_smiley[mo.group(1)]
     string = re.sub(r'<img src="smilies/(\S+)" />', imagerep, string)
@@ -499,8 +499,8 @@ if ostools.isOSXBundle():
-reverse_smiley = dict((v,k) for k, v in smiledict.iteritems())
-_smilere = re.compile("|".join(smiledict.keys()))
+reverse_smiley = dict((v,k) for k, v in smiledict.items())
+_smilere = re.compile("|".join(list(smiledict.keys())))
class ThemeException(Exception):
def __init__(self, value):
diff --git a/pesterchum.py b/pesterchum.py
index 7bff675..9c960b8 100644
--- a/pesterchum.py
+++ b/pesterchum.py
@@ -9,28 +9,28 @@ from datetime import *
import random
import re
from time import time
-import threading, Queue
+import threading, queue
reqmissing = []
optmissing = []
try:
from PyQt5 import QtGui, QtCore, QtWidgets
-except ImportError, e:
+except ImportError as e:
module = str(e)
if module.startswith("No module named ") or \
module.startswith("cannot import name "):
reqmissing.append(module[module.rfind(" ")+1:])
- else: print e
+ else: print(e)
try:
import pygame
-except ImportError, e:
+except ImportError as e:
pygame = None
module = str(e)
if module[:16] == "No module named ": optmissing.append(module[16:])
- else: print e
+ else: print(e)
if reqmissing:
- print "ERROR: The following modules are required for Pesterchum to run and are missing on your system:"
- for m in reqmissing: print "* "+m
+ print("ERROR: The following modules are required for Pesterchum to run and are missing on your system:")
+ for m in reqmissing: print("* "+m)
exit()
vnum = QtCore.qVersion()
major = int(vnum[:vnum.find(".")])
@@ -39,8 +39,8 @@ if vnum.find(".", vnum.find(".")+1) != -1:
else:
minor = int(vnum[vnum.find(".")+1:])
if not ((major > 4) or (major == 4 and minor >= 6)):
- print "ERROR: Pesterchum requires Qt version >= 4.6"
- print "You currently have version " + vnum + ". Please upgrade Qt"
+ print("ERROR: Pesterchum requires Qt version >= 4.6")
+ print("You currently have version " + vnum + ". Please upgrade Qt")
exit()
import ostools
@@ -113,7 +113,7 @@ class waitingMessageHolder(object):
def __init__(self, mainwindow, **msgfuncs):
self.mainwindow = mainwindow
self.funcs = msgfuncs
- self.queue = msgfuncs.keys()
+ self.queue = list(msgfuncs.keys())
if len(self.queue) > 0:
self.mainwindow.updateSystemTray()
def waitingHandles(self):
@@ -129,7 +129,7 @@ class waitingMessageHolder(object):
if len(self.queue) == 0:
self.mainwindow.updateSystemTray()
def addMessage(self, handle, func):
- if not self.funcs.has_key(handle):
+ if handle not in self.funcs:
self.queue.append(handle)
self.funcs[handle] = func
if len(self.queue) > 0:
@@ -282,13 +282,13 @@ class chumArea(RightClickTree):
@QtCore.pyqtSlot()
def beginNotify(self):
- print "BEGIN NOTIFY"
+ print("BEGIN NOTIFY")
self.notify = True
def getOptionsMenu(self):
if not self.currentItem():
return None
- text = unicode(self.currentItem().text(0))
+ text = str(self.currentItem().text(0))
if text.rfind(" (") != -1:
text = text[0:text.rfind(" (")]
if text == "Chums":
@@ -334,13 +334,13 @@ class chumArea(RightClickTree):
if thisitem.rfind(" (") != -1:
thisitem = thisitem[0:thisitem.rfind(" (")]
# Drop item is a group
- thisitem = unicode(event.source().currentItem().text(0))
+ thisitem = str(event.source().currentItem().text(0))
if thisitem.rfind(" (") != -1:
thisitem = thisitem[0:thisitem.rfind(" (")]
if thisitem == "Chums" or thisitem in self.groups:
droppos = self.itemAt(event.pos())
if not droppos: return
- droppos = unicode(droppos.text(0))
+ droppos = str(droppos.text(0))
if droppos.rfind(" ") != -1:
droppos = droppos[0:droppos.rfind(" ")]
if droppos == "Chums" or droppos in self.groups:
@@ -353,16 +353,16 @@ class chumArea(RightClickTree):
gTemp = []
for i in range(self.topLevelItemCount()):
- text = unicode(self.topLevelItem(i).text(0))
+ text = str(self.topLevelItem(i).text(0))
if text.rfind(" (") != -1:
text = text[0:text.rfind(" (")]
- gTemp.append([unicode(text), self.topLevelItem(i).isExpanded()])
+ gTemp.append([str(text), self.topLevelItem(i).isExpanded()])
self.mainwindow.config.saveGroups(gTemp)
# Drop item is a chum
else:
item = self.itemAt(event.pos())
if item:
- text = unicode(item.text(0))
+ text = str(item.text(0))
# Figure out which group to drop into
if text.rfind(" (") != -1:
text = text[0:text.rfind(" (")]
@@ -370,7 +370,7 @@ class chumArea(RightClickTree):
group = text
gitem = item
else:
- ptext = unicode(item.parent().text(0))
+ ptext = str(item.parent().text(0))
if ptext.rfind(" ") != -1:
ptext = ptext[0:ptext.rfind(" ")]
group = ptext
@@ -393,7 +393,7 @@ class chumArea(RightClickTree):
if chums.index(thisitem) < inPos:
inPos -= 1
chums.remove(thisitem)
- chums.insert(inPos, unicode(thisitem))
+ chums.insert(inPos, str(thisitem))
self.mainwindow.config.setChums(chums)
else:
@@ -405,9 +405,9 @@ class chumArea(RightClickTree):
currentGroup = self.currentItem()
if currentGroup:
if currentGroup.parent():
- text = unicode(currentGroup.parent().text(0))
+ text = str(currentGroup.parent().text(0))
else:
- text = unicode(currentGroup.text(0))
+ text = str(currentGroup.text(0))
if text.rfind(" (") != -1:
text = text[0:text.rfind(" (")]
currentGroup = text
@@ -465,7 +465,7 @@ class chumArea(RightClickTree):
return
curgroups = []
for i in range(self.topLevelItemCount()):
- text = unicode(self.topLevelItem(i).text(0))
+ text = str(self.topLevelItem(i).text(0))
if text.rfind(" (") != -1:
text = text[0:text.rfind(" (")]
curgroups.append(text)
@@ -489,31 +489,31 @@ class chumArea(RightClickTree):
totals = {'Chums': 0}
online = {'Chums': 0}
for g in self.groups:
- totals[unicode(g)] = 0
- online[unicode(g)] = 0
+ totals[str(g)] = 0
+ online[str(g)] = 0
for c in self.chums:
yes = c.mood.name() != "offline"
if c.group == "Chums":
- totals[unicode(c.group)] = totals[unicode(c.group)]+1
+ totals[str(c.group)] = totals[str(c.group)]+1
if yes:
- online[unicode(c.group)] = online[unicode(c.group)]+1
+ online[str(c.group)] = online[str(c.group)]+1
elif c.group in totals:
- totals[unicode(c.group)] = totals[unicode(c.group)]+1
+ totals[str(c.group)] = totals[str(c.group)]+1
if yes:
- online[unicode(c.group)] = online[unicode(c.group)]+1
+ online[str(c.group)] = online[str(c.group)]+1
else:
totals["Chums"] = totals["Chums"]+1
if yes:
online["Chums"] = online["Chums"]+1
for i in range(self.topLevelItemCount()):
- text = unicode(self.topLevelItem(i).text(0))
+ text = str(self.topLevelItem(i).text(0))
if text.rfind(" (") != -1:
text = text[0:text.rfind(" (")]
if text in online:
self.topLevelItem(i).setText(0, "%s (%i/%i)" % (text, online[text], totals[text]))
def hideOnlineNumbers(self):
for i in range(self.topLevelItemCount()):
- text = unicode(self.topLevelItem(i).text(0))
+ text = str(self.topLevelItem(i).text(0))
if text.rfind(" (") != -1:
text = text[0:text.rfind(" (")]
self.topLevelItem(i).setText(0, "%s" % (text))
@@ -529,7 +529,7 @@ class chumArea(RightClickTree):
@QtCore.pyqtSlot()
def expandGroup(self):
item = self.currentItem()
- text = unicode(item.text(0))
+ text = str(item.text(0))
if text.rfind(" (") != -1:
text = text[0:text.rfind(" (")]
@@ -544,7 +544,7 @@ class chumArea(RightClickTree):
self.mainwindow.config.addGroup("Chums")
curgroups = []
for i in range(self.topLevelItemCount()):
- text = unicode(self.topLevelItem(i).text(0))
+ text = str(self.topLevelItem(i).text(0))
if text.rfind(" (") != -1:
text = text[0:text.rfind(" (")]
curgroups.append(text)
@@ -561,7 +561,7 @@ class chumArea(RightClickTree):
if self.openGroups[self.groups.index("%s" % (chumLabel.chum.group))]:
child_1.setExpanded(True)
for i in range(self.topLevelItemCount()):
- text = unicode(self.topLevelItem(i).text(0))
+ text = str(self.topLevelItem(i).text(0))
if text.rfind(" (") != -1:
text = text[0:text.rfind(" (")]
if text == chumLabel.chum.group:
@@ -580,7 +580,7 @@ class chumArea(RightClickTree):
bestname = ""
if fi > 0:
while not bestj:
- for j in xrange(self.topLevelItem(i).childCount()):
+ for j in range(self.topLevelItem(i).childCount()):
if chums[fi-c] == str(self.topLevelItem(i).child(j).text(0)):
bestj = j
bestname = chums[fi-c]
@@ -655,7 +655,7 @@ class chumArea(RightClickTree):
def initTheme(self, theme):
self.resize(*theme["main/chums/size"])
self.move(*theme["main/chums/loc"])
- if theme.has_key("main/chums/scrollbar"):
+ if "main/chums/scrollbar" in theme:
self.setStyleSheet("QListWidget { %s } QScrollBar { %s } QScrollBar::handle { %s } QScrollBar::add-line { %s } QScrollBar::sub-line { %s } QScrollBar:up-arrow { %s } QScrollBar:down-arrow { %s }" % (theme["main/chums/style"], theme["main/chums/scrollbar/style"], theme["main/chums/scrollbar/handle"], theme["main/chums/scrollbar/downarrow"], theme["main/chums/scrollbar/uparrow"], theme["main/chums/scrollbar/uarrowstyle"], theme["main/chums/scrollbar/darrowstyle"] ))
else:
self.setStyleSheet(theme["main/chums/style"])
@@ -763,7 +763,7 @@ class chumArea(RightClickTree):
return
(notes, ok) = QtWidgets.QInputDialog.getText(self, "Notes", "Enter your notes...")
if ok:
- notes = unicode(notes)
+ notes = str(notes)
self.mainwindow.chumdb.setNotes(currentChum.handle, notes)
currentChum.setToolTip(0, "%s: %s" % (currentChum.handle, notes))
@QtCore.pyqtSlot()
@@ -773,7 +773,7 @@ class chumArea(RightClickTree):
if not self.renamegroupdialog:
(gname, ok) = QtWidgets.QInputDialog.getText(self, "Rename Group", "Enter a new name for the group:")
if ok:
- gname = unicode(gname)
+ gname = str(gname)
if re.search("[^A-Za-z0-9_\s]", gname) is not None:
msgbox = QtWidgets.QMessageBox()
msgbox.setInformativeText("THIS IS NOT A VALID GROUP NAME")
@@ -787,7 +787,7 @@ class chumArea(RightClickTree):
index = self.indexOfTopLevelItem(currentGroup)
if index != -1:
expanded = currentGroup.isExpanded()
- text = unicode(currentGroup.text(0))
+ text = str(currentGroup.text(0))
if text.rfind(" (") != -1:
text = text[0:text.rfind(" (")]
self.mainwindow.config.delGroup(text)
@@ -807,7 +807,7 @@ class chumArea(RightClickTree):
currentGroup = self.currentItem()
if not currentGroup:
return
- text = unicode(currentGroup.text(0))
+ text = str(currentGroup.text(0))
if text.rfind(" (") != -1:
text = text[0:text.rfind(" (")]
self.mainwindow.config.delGroup(text)
@@ -830,7 +830,7 @@ class chumArea(RightClickTree):
def moveToGroup(self, item):
if not item:
return
- group = unicode(item.text())
+ group = str(item.text())
chumLabel = self.currentItem()
if not chumLabel:
return
@@ -945,7 +945,7 @@ class TrollSlumWindow(QtWidgets.QFrame):
self.addtrolldialog = QtWidgets.QInputDialog(self)
(handle, ok) = self.addtrolldialog.getText(self, "Add Troll", "Enter Troll Handle:")
if ok:
- handle = unicode(handle)
+ handle = str(handle)
if not (PesterProfile.checkLength(handle) and
PesterProfile.checkValid(handle)[0]):
errormsg = QtWidgets.QErrorMessage(self)
@@ -996,8 +996,9 @@ class PesterWindow(MovingWindow):
try:
themeChecker(self.theme)
- except ThemeException, (inst):
- print "Caught: "+inst.parameter
+        except ThemeException as inst:
+            # exception instance bound directly (removed 2to3 rebinding artifact)
+ print("Caught: "+inst.parameter)
themeWarning = QtWidgets.QMessageBox(self)
themeWarning.setText("Theme Error: %s" % (inst))
themeWarning.exec_()
@@ -1157,7 +1158,7 @@ class PesterWindow(MovingWindow):
@QtCore.pyqtSlot()
def updatePC(self):
- version.updateDownload(unicode(self.updatemenu.url))
+ version.updateDownload(str(self.updatemenu.url))
self.updatemenu = None
@QtCore.pyqtSlot()
def noUpdatePC(self):
@@ -1213,7 +1214,7 @@ class PesterWindow(MovingWindow):
return
# notify
if self.config.notifyOptions() & self.config.NEWMSG:
- if not self.convos.has_key(handle):
+ if handle not in self.convos:
t = self.tm.Toast("New Conversation", "From: %s" % handle)
t.show()
elif not self.config.notifyOptions() & self.config.NEWCONVO:
@@ -1231,7 +1232,7 @@ class PesterWindow(MovingWindow):
elif msg == "PESTERCHUM:UNBLOCK":
t = self.tm.Toast("Unblocked", handle)
t.show()
- if not self.convos.has_key(handle):
+ if handle not in self.convos:
if msg == "PESTERCHUM:CEASE": # ignore cease after we hang up
return
matchingChums = [c for c in self.chumList.chums if c.handle == handle]
@@ -1253,12 +1254,12 @@ class PesterWindow(MovingWindow):
else:
self.alarm.play()
def newMemoMsg(self, chan, handle, msg):
- if not self.memos.has_key(chan):
+ if chan not in self.memos:
# silently ignore in case we forgot to /part
return
memo = self.memos[chan]
- msg = unicode(msg)
- if not memo.times.has_key(handle):
+ msg = str(msg)
+ if handle not in memo.times:
# new chum! time current
newtime = timedelta(0)
time = TimeTracker(newtime)
@@ -1296,19 +1297,19 @@ class PesterWindow(MovingWindow):
def changeColor(self, handle, color):
# pesterconvo and chumlist
self.chumList.updateColor(handle, color)
- if self.convos.has_key(handle):
+ if handle in self.convos:
self.convos[handle].updateColor(color)
self.chumdb.setColor(handle, color)
def updateMood(self, handle, mood):
# updates OTHER chums' moods
oldmood = self.chumList.updateMood(handle, mood)
- if self.convos.has_key(handle):
+ if handle in self.convos:
self.convos[handle].updateMood(mood, old=oldmood)
if hasattr(self, 'trollslum') and self.trollslum:
self.trollslum.updateMood(handle, mood)
def newConversation(self, chum, initiated=True):
- if type(chum) in [str, unicode]:
+        if type(chum) in [str]:
matchingChums = [c for c in self.chumList.chums if c.handle == chum]
if len(matchingChums) > 0:
mood = matchingChums[0].mood
@@ -1318,7 +1319,7 @@ class PesterWindow(MovingWindow):
if len(matchingChums) == 0:
self.moodRequest.emit(chum)
- if self.convos.has_key(chum.handle):
+ if chum.handle in self.convos:
self.convos[chum.handle].showChat()
return
if self.config.tabs():
@@ -1331,10 +1332,10 @@ class PesterWindow(MovingWindow):
convoWindow.messageSent.connect(self.sendMessage)
convoWindow.windowClosed.connect(self.closeConvo)
self.convos[chum.handle] = convoWindow
- if unicode(chum.handle).upper() in BOTNAMES:
+ if str(chum.handle).upper() in BOTNAMES:
convoWindow.toggleQuirks(True)
convoWindow.quirksOff.setChecked(True)
- if unicode(chum.handle).upper() in CUSTOMBOTS:
+ if str(chum.handle).upper() in CUSTOMBOTS:
self.newConvoStarted.emit(chum.handle, initiated)
else:
self.newConvoStarted.emit(chum.handle, initiated)
@@ -1350,7 +1351,7 @@ class PesterWindow(MovingWindow):
def newMemo(self, channel, timestr, secret=False, invite=False):
if channel == "#pesterchum":
return
- if self.memos.has_key(channel):
+ if channel in self.memos:
self.memos[channel].showChat()
return
# do slider dialog then set
@@ -1465,19 +1466,19 @@ class PesterWindow(MovingWindow):
if hasattr(self, 'moods'):
self.moods.removeButtons()
mood_list = theme["main/moods"]
- mood_list = [dict([(str(k),v) for (k,v) in d.iteritems()])
+ mood_list = [dict([(str(k),v) for (k,v) in d.items()])
for d in mood_list]
self.moods = PesterMoodHandler(self, *[PesterMoodButton(self, **d) for d in mood_list])
self.moods.showButtons()
# chum
addChumStyle = "QPushButton { %s }" % (theme["main/addchum/style"])
- if theme.has_key("main/addchum/pressed"):
+ if "main/addchum/pressed" in theme:
addChumStyle += "QPushButton:pressed { %s }" % (theme["main/addchum/pressed"])
pesterButtonStyle = "QPushButton { %s }" % (theme["main/pester/style"])
- if theme.has_key("main/pester/pressed"):
+ if "main/pester/pressed" in theme:
pesterButtonStyle += "QPushButton:pressed { %s }" % (theme["main/pester/pressed"])
blockButtonStyle = "QPushButton { %s }" % (theme["main/block/style"])
- if theme.has_key("main/block/pressed"):
+ if "main/block/pressed" in theme:
pesterButtonStyle += "QPushButton:pressed { %s }" % (theme["main/block/pressed"])
self.addChumButton.setText(theme["main/addchum/text"])
self.addChumButton.resize(*theme["main/addchum/size"])
@@ -1502,7 +1503,7 @@ class PesterWindow(MovingWindow):
self.mychumcolor.resize(*theme["main/mychumhandle/colorswatch/size"])
self.mychumcolor.move(*theme["main/mychumhandle/colorswatch/loc"])
self.mychumcolor.setStyleSheet("background: %s" % (self.profile().colorhtml()))
- if self.theme.has_key("main/mychumhandle/currentMood"):
+ if "main/mychumhandle/currentMood" in self.theme:
moodicon = self.profile().mood.icon(theme)
if hasattr(self, 'currentMoodIcon') and self.currentMoodIcon:
self.currentMoodIcon.hide()
@@ -1536,7 +1537,7 @@ class PesterWindow(MovingWindow):
self.namesound = pygame.mixer.Sound("themes/namealarm.wav")
self.ceasesound = pygame.mixer.Sound(theme["main/sounds/ceasesound"])
self.honksound = pygame.mixer.Sound("themes/honk.wav")
- except Exception, e:
+ except Exception as e:
self.alarm = NoneSound()
self.memosound = NoneSound()
self.namesound = NoneSound()
@@ -1556,7 +1557,8 @@ class PesterWindow(MovingWindow):
# check theme
try:
themeChecker(theme)
- except ThemeException, (inst):
+        except ThemeException as inst:
+            # exception instance bound directly (removed 2to3 rebinding artifact)
themeWarning = QtWidgets.QMessageBox(self)
themeWarning.setText("Theme Error: %s" % (inst))
themeWarning.exec_()
@@ -1639,7 +1641,7 @@ class PesterWindow(MovingWindow):
def pesterSelectedChum(self):
curChum = self.chumList.currentItem()
if curChum:
- text = unicode(curChum.text(0))
+ text = str(curChum.text(0))
if text.rfind(" (") != -1:
text = text[0:text.rfind(" (")]
if text not in self.chumList.groups and \
@@ -1655,7 +1657,7 @@ class PesterWindow(MovingWindow):
self.newConversation(chum)
@QtCore.pyqtSlot('QString')
def closeConvo(self, handle):
- h = unicode(handle)
+ h = str(handle)
try:
chum = self.convos[h].chum
except KeyError:
@@ -1671,7 +1673,7 @@ class PesterWindow(MovingWindow):
del self.convos[h]
@QtCore.pyqtSlot('QString')
def closeMemo(self, channel):
- c = unicode(channel)
+ c = str(channel)
self.chatlog.finish(c)
self.leftChannel.emit(channel)
try:
@@ -1689,27 +1691,27 @@ class PesterWindow(MovingWindow):
@QtCore.pyqtSlot('QString', Mood)
def updateMoodSlot(self, handle, mood):
- h = unicode(handle)
+ h = str(handle)
self.updateMood(h, mood)
@QtCore.pyqtSlot('QString', QtGui.QColor)
def updateColorSlot(self, handle, color):
- h = unicode(handle)
+ h = str(handle)
self.changeColor(h, color)
@QtCore.pyqtSlot('QString', 'QString')
def deliverMessage(self, handle, msg):
- h = unicode(handle)
- m = unicode(msg)
+ h = str(handle)
+ m = str(msg)
self.newMessage(h, m)
@QtCore.pyqtSlot('QString', 'QString', 'QString')
def deliverMemo(self, chan, handle, msg):
- (c, h, m) = (unicode(chan), unicode(handle), unicode(msg))
+ (c, h, m) = (str(chan), str(handle), str(msg))
self.newMemoMsg(c,h,m)
@QtCore.pyqtSlot('QString', 'QString')
def deliverNotice(self, handle, msg):
- h = unicode(handle)
- m = unicode(msg)
+ h = str(handle)
+ m = str(msg)
if m.startswith("Your nickname is now being changed to"):
changedto = m[39:-1]
msgbox = QtWidgets.QMessageBox()
@@ -1719,7 +1721,7 @@ class PesterWindow(MovingWindow):
ret = msgbox.exec_()
elif h == self.randhandler.randNick:
self.randhandler.incoming(msg)
- elif self.convos.has_key(h):
+ elif h in self.convos:
self.newMessage(h, m)
elif h.upper() == "NICKSERV" and "PESTERCHUM:" not in m:
m = nickservmsgs.translate(m)
@@ -1734,7 +1736,7 @@ class PesterWindow(MovingWindow):
msgbox.setStandardButtons(QtWidgets.QMessageBox.Ok | QtWidgets.QMessageBox.Cancel)
ret = msgbox.exec_()
if ret == QtWidgets.QMessageBox.Ok:
- self.newMemo(unicode(channel), "+0:00")
+ self.newMemo(str(channel), "+0:00")
@QtCore.pyqtSlot('QString')
def chanInviteOnly(self, channel):
self.inviteOnlyChan.emit(channel)
@@ -1746,35 +1748,35 @@ class PesterWindow(MovingWindow):
self.modesUpdated.emit(channel, modes)
@QtCore.pyqtSlot('QString', 'QString', 'QString')
def timeCommand(self, chan, handle, command):
- (c, h, cmd) = (unicode(chan), unicode(handle), unicode(command))
+ (c, h, cmd) = (str(chan), str(handle), str(command))
if self.memos[c]:
self.memos[c].timeUpdate(h, cmd)
@QtCore.pyqtSlot('QString', 'QString', 'QString')
def quirkDisable(self, channel, msg, op):
- (c, msg, op) = (unicode(channel), unicode(msg), unicode(op))
- if not self.memos.has_key(c):
+ (c, msg, op) = (str(channel), str(msg), str(op))
+ if c not in self.memos:
return
memo = self.memos[c]
memo.quirkDisable(op, msg)
@QtCore.pyqtSlot('QString', PesterList)
def updateNames(self, channel, names):
- c = unicode(channel)
+ c = str(channel)
# update name DB
self.namesdb[c] = names
# warn interested party of names
self.namesUpdated.emit(c)
@QtCore.pyqtSlot('QString', 'QString', 'QString')
def userPresentUpdate(self, handle, channel, update):
- c = unicode(channel)
- n = unicode(handle)
+ c = str(channel)
+ n = str(handle)
if update == "nick":
l = n.split(":")
oldnick = l[0]
newnick = l[1]
if update in ("quit", "netsplit"):
- for c in self.namesdb.keys():
+ for c in list(self.namesdb.keys()):
try:
i = self.namesdb[c].index(n)
self.namesdb[c].pop(i)
@@ -1791,7 +1793,7 @@ class PesterWindow(MovingWindow):
except KeyError:
self.namesdb[c] = []
elif update == "nick":
- for c in self.namesdb.keys():
+ for c in list(self.namesdb.keys()):
try:
i = self.namesdb[c].index(oldnick)
self.namesdb[c].pop(i)
@@ -1818,12 +1820,12 @@ class PesterWindow(MovingWindow):
available_groups = [g[0] for g in self.config.getGroups()]
self.addchumdialog = AddChumDialog(available_groups, self)
ok = self.addchumdialog.exec_()
- handle = unicode(self.addchumdialog.chumBox.text()).strip()
- newgroup = unicode(self.addchumdialog.newgroup.text()).strip()
+ handle = str(self.addchumdialog.chumBox.text()).strip()
+ newgroup = str(self.addchumdialog.newgroup.text()).strip()
selectedGroup = self.addchumdialog.groupBox.currentText()
group = newgroup if newgroup else selectedGroup
if ok:
- handle = unicode(handle)
+ handle = str(handle)
if handle in [h.handle for h in self.chumList.chums]:
self.addchumdialog = None
return
@@ -1855,10 +1857,10 @@ class PesterWindow(MovingWindow):
@QtCore.pyqtSlot('QString')
def blockChum(self, handle):
- h = unicode(handle)
+ h = str(handle)
self.config.addBlocklist(h)
self.config.removeChum(h)
- if self.convos.has_key(h):
+ if h in self.convos:
convo = self.convos[h]
msg = self.profile().pestermsg(convo.chum, QtGui.QColor(self.theme["convo/systemMsgColor"]), self.theme["convo/text/blocked"])
convo.textArea.append(convertTags(msg))
@@ -1873,9 +1875,9 @@ class PesterWindow(MovingWindow):
@QtCore.pyqtSlot('QString')
def unblockChum(self, handle):
- h = unicode(handle)
+ h = str(handle)
self.config.delBlocklist(h)
- if self.convos.has_key(h):
+ if h in self.convos:
convo = self.convos[h]
msg = self.profile().pestermsg(convo.chum, QtGui.QColor(self.theme["convo/systemMsgColor"]), self.theme["convo/text/unblocked"])
convo.textArea.append(convertTags(msg))
@@ -1896,7 +1898,7 @@ class PesterWindow(MovingWindow):
self.randhandler.setIdle(True)
sysColor = QtGui.QColor(self.theme["convo/systemMsgColor"])
verb = self.theme["convo/text/idle"]
- for (h, convo) in self.convos.iteritems():
+ for (h, convo) in self.convos.items():
if convo.chumopen:
msg = self.profile().idlemsg(sysColor, verb)
convo.textArea.append(convertTags(msg))
@@ -1930,7 +1932,7 @@ class PesterWindow(MovingWindow):
return
fp = open(f, 'r')
regexp_state = None
- for l in fp.xreadlines():
+ for l in fp:
# import chumlist
l = l.rstrip()
chum_mo = re.match("handle: ([A-Za-z0-9]+)", l)
@@ -1944,7 +1946,7 @@ class PesterWindow(MovingWindow):
replace = replace_mo.group(1)
try:
re.compile(regexp_state)
- except re.error, e:
+ except re.error as e:
continue
newquirk = pesterQuirk({"type": "regexp",
"from": regexp_state,
@@ -1980,18 +1982,18 @@ class PesterWindow(MovingWindow):
@QtCore.pyqtSlot()
def joinSelectedMemo(self):
- time = unicode(self.memochooser.timeinput.text())
+ time = str(self.memochooser.timeinput.text())
secret = self.memochooser.secretChannel.isChecked()
invite = self.memochooser.inviteChannel.isChecked()
if self.memochooser.newmemoname():
newmemo = self.memochooser.newmemoname()
- channel = "#"+unicode(newmemo).replace(" ", "_")
+ channel = "#"+str(newmemo).replace(" ", "_")
channel = re.sub(r"[^A-Za-z0-9#_]", "", channel)
self.newMemo(channel, time, secret=secret, invite=invite)
for SelectedMemo in self.memochooser.SelectedMemos():
- channel = "#"+unicode(SelectedMemo.target)
+ channel = "#"+str(SelectedMemo.target)
self.newMemo(channel, time)
self.memochooser = None
@@ -2018,12 +2020,12 @@ class PesterWindow(MovingWindow):
@QtCore.pyqtSlot('QString')
def userListAdd(self, handle):
- h = unicode(handle)
+ h = str(handle)
chum = PesterProfile(h, chumdb=self.chumdb)
self.addChum(chum)
@QtCore.pyqtSlot('QString')
def userListPester(self, handle):
- h = unicode(handle)
+ h = str(handle)
self.newConversation(h)
@QtCore.pyqtSlot()
def userListClose(self):
@@ -2043,7 +2045,7 @@ class PesterWindow(MovingWindow):
@QtCore.pyqtSlot()
def updateQuirks(self):
for i in range(self.quirkmenu.quirkList.topLevelItemCount()):
- curgroup = unicode(self.quirkmenu.quirkList.topLevelItem(i).text(0))
+ curgroup = str(self.quirkmenu.quirkList.topLevelItem(i).text(0))
for j in range(self.quirkmenu.quirkList.topLevelItem(i).childCount()):
item = self.quirkmenu.quirkList.topLevelItem(i).child(j)
item.quirk.quirk["on"] = item.quirk.on = (item.checkState(0) == QtCore.Qt.Checked)
@@ -2066,7 +2068,7 @@ class PesterWindow(MovingWindow):
(chum, ok) = QtWidgets.QInputDialog.getText(self, "Pester Chum", "Enter a handle to pester:")
try:
if ok:
- self.newConversation(unicode(chum))
+ self.newConversation(str(chum))
except:
pass
finally:
@@ -2094,7 +2096,7 @@ class PesterWindow(MovingWindow):
if not self.addgroupdialog:
(gname, ok) = QtWidgets.QInputDialog.getText(self, "Add Group", "Enter a name for the new group:")
if ok:
- gname = unicode(gname)
+ gname = str(gname)
if re.search("[^A-Za-z0-9_\s]", gname) is not None:
msgbox = QtWidgets.QMessageBox()
msgbox.setInformativeText("THIS IS NOT A VALID GROUP NAME")
@@ -2144,7 +2146,7 @@ class PesterWindow(MovingWindow):
# combine
self.createTabWindow()
newconvos = {}
- for (h,c) in self.convos.iteritems():
+ for (h,c) in self.convos.items():
c.setParent(self.tabconvo)
self.tabconvo.addChat(c)
self.tabconvo.show()
@@ -2174,7 +2176,7 @@ class PesterWindow(MovingWindow):
# combine
newmemos = {}
self.createMemoTabWindow()
- for (h,m) in self.memos.iteritems():
+ for (h,m) in self.memos.items():
m.setParent(self.tabmemo)
self.tabmemo.addChat(m)
self.tabmemo.show()
@@ -2223,7 +2225,7 @@ class PesterWindow(MovingWindow):
# timestamps
timestampsetting = self.optionmenu.timestampcheck.isChecked()
self.config.set("showTimeStamps", timestampsetting)
- timeformatsetting = unicode(self.optionmenu.timestampBox.currentText())
+ timeformatsetting = str(self.optionmenu.timestampBox.currentText())
if timeformatsetting == "12 hour":
self.config.set("time12Format", True)
else:
@@ -2333,7 +2335,7 @@ class PesterWindow(MovingWindow):
self.config.set('blink', blinksetting)
# toast notifications
self.tm.setEnabled(self.optionmenu.notifycheck.isChecked())
- self.tm.setCurrentType(unicode(self.optionmenu.notifyOptions.currentText()))
+ self.tm.setCurrentType(str(self.optionmenu.notifyOptions.currentText()))
notifysetting = 0
if self.optionmenu.notifySigninCheck.isChecked():
notifysetting |= self.config.SIGNIN
@@ -2373,7 +2375,7 @@ class PesterWindow(MovingWindow):
newmodes = self.optionmenu.modechange.text()
if newmodes:
self.setChannelMode.emit(self.profile().handle, newmodes, "")
- except Exception, e:
+ except Exception as e:
logging.error(e)
finally:
self.optionmenu = None
@@ -2400,13 +2402,13 @@ class PesterWindow(MovingWindow):
@QtCore.pyqtSlot()
def themeSelected(self, override=False):
if not override:
- themename = unicode(self.optionmenu.themeBox.currentText())
+ themename = str(self.optionmenu.themeBox.currentText())
else:
themename = override
if override or themename != self.theme.name:
try:
self.changeTheme(pesterTheme(themename))
- except ValueError, e:
+ except ValueError as e:
themeWarning = QtWidgets.QMessageBox(self)
themeWarning.setText("Theme Error: %s" % (e))
themeWarning.exec_()
@@ -2422,14 +2424,14 @@ class PesterWindow(MovingWindow):
def profileSelected(self):
if self.chooseprofile.profileBox and \
self.chooseprofile.profileBox.currentIndex() > 0:
- handle = unicode(self.chooseprofile.profileBox.currentText())
+ handle = str(self.chooseprofile.profileBox.currentText())
if handle == self.profile().handle:
self.chooseprofile = None
return
self.userprofile = userProfile(handle)
self.changeTheme(self.userprofile.getTheme())
else:
- handle = unicode(self.chooseprofile.chumHandle.text())
+ handle = str(self.chooseprofile.chumHandle.text())
if handle == self.profile().handle:
self.chooseprofile = None
return
@@ -2528,7 +2530,7 @@ class PesterWindow(MovingWindow):
if not hasattr(self, 'chooseprofile'):
self.chooseprofile = None
if not self.chooseprofile:
- h = unicode(handle)
+ h = str(handle)
self.changeProfile(collision=h)
@QtCore.pyqtSlot('QString')
def myHandleChanged(self, handle):
@@ -2621,10 +2623,10 @@ class MainProgram(QtCore.QObject):
try:
pygame.mixer.init()
pygame.mixer.init()
- except pygame.error, e:
- print "Warning: No sound! %s" % (e)
+ except pygame.error as e:
+ print("Warning: No sound! %s" % (e))
else:
- print "Warning: No sound!"
+ print("Warning: No sound!")
self.widget = PesterWindow(options, app=self.app)
self.widget.show()
@@ -2682,7 +2684,7 @@ class MainProgram(QtCore.QObject):
@QtCore.pyqtSlot()
def runUpdateSlot(self):
- q = Queue.Queue(1)
+ q = queue.Queue(1)
s = threading.Thread(target=version.updateCheck, args=(q,))
w = threading.Thread(target=self.showUpdate, args=(q,))
w.start()
@@ -2813,7 +2815,7 @@ Click this message to never see this again.")
for c in self.widget.tabmemo.convos:
self.irc.joinChannel(c)
else:
- for c in self.widget.memos.values():
+ for c in list(self.widget.memos.values()):
self.irc.joinChannel(c.channel)
return True
diff --git a/profile.py b/profile.py
index 64b6d28..6d5ef1e 100644
--- a/profile.py
+++ b/profile.py
@@ -41,17 +41,17 @@ class PesterLog(object):
if not self.parent.config.logPesters() & self.parent.config.LOG: return
if not self.parent.config.logPesters() & self.parent.config.STAMP:
time = ""
- if unicode(handle).upper() == "NICKSERV": return
+ if str(handle).upper() == "NICKSERV": return
#watch out for illegal characters
handle = re.sub(r'[<>:"/\\|?*]', "_", handle)
bbcodemsg = time + convertTags(msg, "bbcode")
html = time + convertTags(msg, "html")+"
"
msg = time +convertTags(msg, "text")
modes = {"bbcode": bbcodemsg, "html": html, "text": msg}
- if not self.convos.has_key(handle):
+ if handle not in self.convos:
time = datetime.now().strftime("%Y-%m-%d.%H.%M")
self.convos[handle] = {}
- for (format, t) in modes.iteritems():
+ for (format, t) in modes.items():
if not os.path.exists("%s/%s/%s/%s" % (self.logpath, self.handle, handle, format)):
os.makedirs("%s/%s/%s/%s" % (self.logpath, self.handle, handle, format))
try:
@@ -63,7 +63,7 @@ class PesterLog(object):
errmsg.show()
continue
self.convos[handle][format] = fp
- for (format, t) in modes.iteritems():
+ for (format, t) in modes.items():
f = self.convos[handle][format]
if platform.system() == "Windows":
f.write(t+"\r\n")
@@ -71,14 +71,14 @@ class PesterLog(object):
f.write(t+"\r\n")
f.flush()
def finish(self, handle):
- if not self.convos.has_key(handle):
+ if handle not in self.convos:
return
- for f in self.convos[handle].values():
+ for f in list(self.convos[handle].values()):
f.close()
del self.convos[handle]
def close(self):
- for h in self.convos.keys():
- for f in self.convos[h].values():
+ for h in list(self.convos.keys()):
+ for f in list(self.convos[h].values()):
f.close()
class userConfig(object):
@@ -100,7 +100,7 @@ class userConfig(object):
fp = open(self.filename)
self.config = json.load(fp)
fp.close()
- if self.config.has_key("defaultprofile"):
+ if "defaultprofile" in self.config:
self.userprofile = userProfile(self.config["defaultprofile"])
else:
self.userprofile = None
@@ -125,7 +125,7 @@ class userConfig(object):
fp.close()
def chums(self):
- if not self.config.has_key('chums'):
+ if 'chums' not in self.config:
self.set("chums", [])
return self.config.get('chums', [])
def setChums(self, newchums):
@@ -148,19 +148,19 @@ class userConfig(object):
def tabs(self):
return self.config.get("tabs", True)
def tabMemos(self):
- if not self.config.has_key('tabmemos'):
+ if 'tabmemos' not in self.config:
self.set("tabmemos", self.tabs())
return self.config.get("tabmemos", True)
def showTimeStamps(self):
- if not self.config.has_key('showTimeStamps'):
+ if 'showTimeStamps' not in self.config:
self.set("showTimeStamps", True)
return self.config.get('showTimeStamps', True)
def time12Format(self):
- if not self.config.has_key('time12Format'):
+ if 'time12Format' not in self.config:
self.set("time12Format", True)
return self.config.get('time12Format', True)
def showSeconds(self):
- if not self.config.has_key('showSeconds'):
+ if 'showSeconds' not in self.config:
self.set("showSeconds", False)
return self.config.get('showSeconds', False)
def sortMethod(self):
@@ -174,11 +174,11 @@ class userConfig(object):
return g[1]
return True
def showEmptyGroups(self):
- if not self.config.has_key('emptyGroups'):
+ if 'emptyGroups' not in self.config:
self.set("emptyGroups", False)
return self.config.get('emptyGroups', False)
def showOnlineNumbers(self):
- if not self.config.has_key('onlineNumbers'):
+ if 'onlineNumbers' not in self.config:
self.set("onlineNumbers", False)
return self.config.get('onlineNumbers', False)
def logPesters(self):
@@ -238,7 +238,7 @@ class userConfig(object):
newchums = [c for c in self.config['chums'] if c != handle]
self.set("chums", newchums)
def getBlocklist(self):
- if not self.config.has_key('block'):
+ if 'block' not in self.config:
self.set('block', [])
return self.config['block']
def addBlocklist(self, handle):
@@ -251,7 +251,7 @@ class userConfig(object):
l.pop(l.index(handle))
self.set('block', l)
def getGroups(self):
- if not self.groups.has_key('groups'):
+ if 'groups' not in self.groups:
self.saveGroups([["Chums", True]])
return self.groups.get('groups', [["Chums", True]])
def addGroup(self, group, open=True):
@@ -285,7 +285,7 @@ class userConfig(object):
self.groups['groups'] = groups
try:
jsonoutput = json.dumps(self.groups)
- except ValueError, e:
+ except ValueError as e:
raise e
fp = open("%s/groups.js" % (self.logpath), 'w')
fp.write(jsonoutput)
@@ -300,7 +300,7 @@ class userConfig(object):
return self.parent.portOverride
return self.config.get('port', '6667')
def soundOn(self):
- if not self.config.has_key('soundon'):
+ if 'soundon' not in self.config:
self.set('soundon', True)
return self.config['soundon']
def chatSound(self):
@@ -319,7 +319,7 @@ class userConfig(object):
self.config[item] = setting
try:
jsonoutput = json.dumps(self.config)
- except ValueError, e:
+ except ValueError as e:
raise e
fp = open(self.filename, 'w')
fp.write(jsonoutput)
@@ -356,7 +356,7 @@ class userProfile(object):
if type(user) is PesterProfile:
self.chat = user
self.userprofile = {"handle":user.handle,
- "color": unicode(user.color.name()),
+ "color": str(user.color.name()),
"quirks": [],
"theme": "pesterchum"}
self.theme = pesterTheme("pesterchum")
@@ -377,7 +377,7 @@ class userProfile(object):
fp.close()
try:
self.theme = pesterTheme(self.userprofile["theme"])
- except ValueError, e:
+ except ValueError as e:
self.theme = pesterTheme("pesterchum")
self.lastmood = self.userprofile.get('lastmood', self.theme["main/defaultmood"])
self.chat = PesterProfile(self.userprofile["handle"],
@@ -402,7 +402,7 @@ class userProfile(object):
try:
with open(_datadir+"passwd.js") as fp:
self.passwd = json.load(fp)
- except Exception, e:
+ except Exception as e:
self.passwd = {}
self.autoidentify = False
self.nickservpass = ""
@@ -418,7 +418,7 @@ class userProfile(object):
self.save()
def setColor(self, color):
self.chat.color = color
- self.userprofile["color"] = unicode(color.name())
+ self.userprofile["color"] = str(color.name())
self.save()
def setQuirks(self, quirks):
self.quirks = quirks
@@ -436,7 +436,7 @@ class userProfile(object):
try:
for (i,m) in enumerate(mentions):
re.compile(m)
- except re.error, e:
+ except re.error as e:
logging.error("#%s Not a valid regular expression: %s" % (i, e))
else:
self.mentions = mentions
@@ -479,19 +479,19 @@ class userProfile(object):
return
try:
jsonoutput = json.dumps(self.userprofile)
- except ValueError, e:
+ except ValueError as e:
raise e
fp = open("%s/%s.js" % (self.profiledir, handle), 'w')
fp.write(jsonoutput)
fp.close()
def saveNickServPass(self):
# remove profiles with no passwords
- for h,t in self.passwd.items():
+ for h,t in list(self.passwd.items()):
if "auto" not in t or "pw" not in t or t["pw"] == "":
del self.passwd[h]
try:
jsonoutput = json.dumps(self.passwd, indent=4)
- except ValueError, e:
+ except ValueError as e:
raise e
with open(_datadir+"passwd.js", 'w') as fp:
fp.write(jsonoutput)
@@ -526,7 +526,7 @@ class PesterProfileDB(dict):
fp.close()
u = []
- for (handle, c) in chumdict.iteritems():
+ for (handle, c) in chumdict.items():
options = dict()
if 'group' in c:
options['group'] = c['group']
@@ -543,39 +543,39 @@ class PesterProfileDB(dict):
def save(self):
try:
fp = open("%s/chums.js" % (self.logpath), 'w')
- chumdict = dict([p.plaindict() for p in self.itervalues()])
+ chumdict = dict([p.plaindict() for p in self.values()])
json.dump(chumdict, fp)
fp.close()
- except Exception, e:
+ except Exception as e:
raise e
def getColor(self, handle, default=None):
- if not self.has_key(handle):
+ if handle not in self:
return default
else:
return self[handle].color
def setColor(self, handle, color):
- if self.has_key(handle):
+ if handle in self:
self[handle].color = color
else:
self[handle] = PesterProfile(handle, color)
def getGroup(self, handle, default="Chums"):
- if not self.has_key(handle):
+ if handle not in self:
return default
else:
return self[handle].group
def setGroup(self, handle, theGroup):
- if self.has_key(handle):
+ if handle in self:
self[handle].group = theGroup
else:
self[handle] = PesterProfile(handle, group=theGroup)
self.save()
def getNotes(self, handle, default=""):
- if not self.has_key(handle):
+ if handle not in self:
return default
else:
return self[handle].notes
def setNotes(self, handle, notes):
- if self.has_key(handle):
+ if handle in self:
self[handle].notes = notes
else:
self[handle] = PesterProfile(handle, notes=notes)
@@ -604,7 +604,7 @@ class pesterTheme(dict):
except IOError:
theme = json.loads("{}")
self.update(theme)
- if self.has_key("inherits"):
+ if "inherits" in self:
self.inheritedTheme = pesterTheme(self["inherits"])
if not default:
self.defaultTheme = pesterTheme("pesterchum", default=True)
@@ -612,7 +612,7 @@ class pesterTheme(dict):
keys = key.split("/")
try:
v = dict.__getitem__(self, keys.pop(0))
- except KeyError, e:
+ except KeyError as e:
if hasattr(self, 'inheritedTheme'):
return self.inheritedTheme[key]
if hasattr(self, 'defaultTheme'):
@@ -622,7 +622,7 @@ class pesterTheme(dict):
for k in keys:
try:
v = v[k]
- except KeyError, e:
+ except KeyError as e:
if hasattr(self, 'inheritedTheme'):
return self.inheritedTheme[key]
if hasattr(self, 'defaultTheme'):
@@ -631,8 +631,8 @@ class pesterTheme(dict):
raise e
return v
def pathHook(self, d):
- for (k, v) in d.iteritems():
- if type(v) is unicode:
+ for (k, v) in d.items():
+ if type(v) is str:
s = Template(v)
d[k] = s.safe_substitute(path=self.path)
return d
@@ -658,6 +658,6 @@ class pesterTheme(dict):
return False if v is None else True
except KeyError:
if hasattr(self, 'inheritedTheme'):
- return self.inheritedTheme.has_key(key)
+ return key in self.inheritedTheme
else:
return False
diff --git a/pyquirks.py b/pyquirks.py
index f4a5b37..e275cc5 100644
--- a/pyquirks.py
+++ b/pyquirks.py
@@ -12,20 +12,20 @@ class PythonQuirks(ScriptQuirks):
def modHas(self, module, attr):
if attr == 'commands':
variables = vars(module)
- for name, obj in variables.iteritems():
+ for name, obj in variables.items():
if self.modHas(obj, 'command'):
return True
return hasattr(module, attr)
def register(self, module):
variables = vars(module)
- for name, obj in variables.iteritems():
+ for name, obj in variables.items():
if self.modHas(obj, 'command'):
try:
- if not isinstance(obj("test"), basestring):
+ if not isinstance(obj("test"), str):
raise Exception
except:
- print "Quirk malformed: %s" % (obj.command)
+ print("Quirk malformed: %s" % (obj.command))
msgbox = QtWidgets.QMessageBox()
msgbox.setWindowTitle("Error!")
msgbox.setText("Quirk malformed: %s" % (obj.command))
diff --git a/quirks.py b/quirks.py
index 7499abe..2863918 100644
--- a/quirks.py
+++ b/quirks.py
@@ -20,7 +20,7 @@ class ScriptQuirks(object):
self.last = self.quirks.copy()
self.quirks.clear()
for script in self.scripts:
- print script.getExtension()
+ print(script.getExtension())
script.load()
#print script.quirks
for q in script.quirks:
@@ -31,9 +31,9 @@ class ScriptQuirks(object):
del self.quirks[k]
#print self.quirks
if self.quirks:
- print 'Registered quirks:', '(), '.join(self.quirks) + "()"
+ print('Registered quirks:', '(), '.join(self.quirks) + "()")
else:
- print "Warning: Couldn't find any script quirks"
+ print("Warning: Couldn't find any script quirks")
def add(self, script):
self.scripts.append(script)
@@ -64,8 +64,8 @@ class ScriptQuirks(object):
module = self.loadModule(name, filename)
if module is None:
continue
- except Exception, e:
- print "Error loading %s: %s (in quirks.py)" % (os.path.basename(name), e)
+ except Exception as e:
+ print("Error loading %s: %s (in quirks.py)" % (os.path.basename(name), e))
msgbox = QtWidgets.QMessageBox()
msgbox.setWindowTitle("Error!")
msgbox.setText("Error loading %s: %s (in quirks.py)" % (os.path.basename(filename), e))
diff --git a/randomer.py b/randomer.py
index af60239..4df2b04 100644
--- a/randomer.py
+++ b/randomer.py
@@ -63,6 +63,6 @@ class RandomHandler(QtCore.QObject):
msgbox.setInformativeText("Try again later :(")
msgbox.exec_()
return
- name = unicode(l[1])
- print name
+ name = str(l[1])
+ print(name)
self.mainwindow.newConversation(name)
diff --git a/toast.py b/toast.py
index c040d62..2d89cad 100644
--- a/toast.py
+++ b/toast.py
@@ -4,27 +4,29 @@ import time, os
import ostools
from PyQt5 import QtGui, QtCore, QtWidgets
+import logging
+
try:
import pynotify
except:
pynotify = None
-class DefaultToast(object):
+class DefaultToast(QtWidgets.QWidget):
def __init__(self, parent, **kwds):
- super(DefaultToast, self).__init__(parent, **kwds)
+ super().__init__(parent)
self.machine = kwds.get('machine')
self.title = kwds.get('title')
self.msg = kwds.get('msg')
self.icon = kwds.get('icon')
def show(self):
- print self.title, self.msg, self.icon
+ print(self.title, self.msg, self.icon)
self.done()
def done(self):
t = self.machine.toasts[0]
if t.title == self.title and t.msg == self.msg and t.icon == self.icon:
self.machine.toasts.pop(0)
self.machine.displaying = False
- print "Done"
+ print("Done")
class ToastMachine(object):
class __Toast__(object):
@@ -73,7 +75,7 @@ class ToastMachine(object):
def realShow(self):
self.machine.displaying = True
t = None
- for (k,v) in self.machine.types.iteritems():
+ for (k,v) in self.machine.types.items():
if self.machine.type == k:
try:
args = inspect.getargspec(v.__init__).args
@@ -143,15 +145,15 @@ class ToastMachine(object):
if type in self.types:
if type == "libnotify":
if not pynotify or not pynotify.init("ToastMachine"):
- print "Problem initilizing pynotify"
+ print("Problem initilizing pynotify")
return
#self.type = type = "default"
elif type == "twmn":
from libs import pytwmn
try:
pytwmn.init()
- except pytwmn.ERROR, e:
- print "Problem initilizing pytwmn: " + str(e)
+ except pytwmn.ERROR as e:
+ print("Problem initilizing pytwmn: " + str(e))
return
#self.type = type = "default"
self.type = type
@@ -177,9 +179,11 @@ class ToastMachine(object):
self.showNext()
-class PesterToast(QtWidgets.QWidget, DefaultToast):
+class PesterToast(DefaultToast):
def __init__(self, machine, title, msg, icon, time=3000, parent=None):
- super(PesterToast, self).__init__(self, parent, machine=machine, title=title, msg=msg, icon=icon)
+ logging.info(isinstance(parent, QtWidgets.QWidget))
+ kwds = dict(machine=machine, title=title, msg=msg, icon=icon)
+ super().__init__(parent, **kwds)
self.machine = machine
self.time = time
@@ -210,7 +214,6 @@ class PesterToast(QtWidgets.QWidget, DefaultToast):
self.icon.pixmap().fill(QtGui.QColor(0,0,0,0))
layout_0 = QtWidgets.QVBoxLayout()
- layout_0.setMargin(0)
layout_0.setContentsMargins(0, 0, 0, 0)
if self.icon:
@@ -237,7 +240,7 @@ class PesterToast(QtWidgets.QWidget, DefaultToast):
self.msg.setStyleSheet(self.parent().theme["toasts/content/style"])
self.layout().setSpacing(0)
- self.msg.setText(PesterToast.wrapText(self.msg.font(), unicode(self.msg.text()), self.parent().theme["toasts/width"], self.parent().theme["toasts/content/style"]))
+ self.msg.setText(PesterToast.wrapText(self.msg.font(), str(self.msg.text()), self.parent().theme["toasts/width"], self.parent().theme["toasts/content/style"]))
p = QtWidgets.QApplication.desktop().availableGeometry(self).bottomRight()
o = QtWidgets.QApplication.desktop().screenGeometry(self).bottomRight()
@@ -255,8 +258,8 @@ class PesterToast(QtWidgets.QWidget, DefaultToast):
def done(self):
QtWidgets.QWidget.hide(self)
t = self.machine.toasts[0]
- if t.title == unicode(self.title.text()) and \
- t.msg == unicode(self.content):
+ if t.title == str(self.title.text()) and \
+ t.msg == str(self.content):
self.machine.toasts.pop(0)
self.machine.displaying = False
if self.machine.on:
@@ -266,7 +269,7 @@ class PesterToast(QtWidgets.QWidget, DefaultToast):
@QtCore.pyqtSlot()
def reverseTrigger(self):
if self.time >= 0:
- QtCore.QTimer.singleShot(self.time, self, QtCore.SLOT('reverseStart()'))
+ QtCore.QTimer.singleShot(self.time, self.reverseStart)
@QtCore.pyqtSlot()
def reverseStart(self):
@@ -283,7 +286,7 @@ class PesterToast(QtWidgets.QWidget, DefaultToast):
def updateBottomLeftAnimation(self, value):
p = QtWidgets.QApplication.desktop().availableGeometry(self).bottomRight()
val = float(self.height())/100
- self.move(p.x()-self.width(), p.y() - (value.toInt()[0] * val) +1)
+ self.move(p.x()-self.width(), p.y() - (value * val) +1)
self.layout().setSpacing(0)
QtWidgets.QWidget.show(self)
@@ -349,7 +352,7 @@ class PesterToast(QtWidgets.QWidget, DefaultToast):
break
if (metric.width(text[:lastspace]) > maxwidth) or \
len(text[:lastspace]) < 1:
- for i in xrange(len(text)):
+ for i in range(len(text)):
if metric.width(text[:i]) > maxwidth:
lastspace = i-1
break
diff --git a/updatecheck.py b/updatecheck.py
index b5dfd5b..bba4302 100644
--- a/updatecheck.py
+++ b/updatecheck.py
@@ -34,19 +34,20 @@ class MSPAChecker(QtWidgets.QWidget):
raise
if os.path.exists("status_old.pkl"):
os.remove("status_old.pkl")
- except Exception, e:
- print e
+ except Exception as e:
+ print(e)
msg = QtWidgets.QMessageBox(self)
msg.setText("Problems writing save file.")
msg.show()
@QtCore.pyqtSlot()
def check_site_wrapper(self):
+ return # turn off MSPA check; python3 doesnt like it
if not self.mainwindow.config.checkMSPA():
return
if self.lock:
return
- print "Checking MSPA updates..."
+ print("Checking MSPA updates...")
s = threading.Thread(target=self.check_site)
s.start()
@@ -88,7 +89,7 @@ class MSPAChecker(QtWidgets.QWidget):
@QtCore.pyqtSlot()
def visit_site(self):
- print self.status['last_visited']['link']
+ print(self.status['last_visited']['link'])
QtGui.QDesktopServices.openUrl(QtCore.QUrl(self.status['last_visited']['link'], QtCore.QUrl.TolerantMode))
if self.status['last_seen']['pubdate'] > self.status['last_visited']['pubdate']:
#Visited for the first time. Untrip the icon and remember that we saw it.
diff --git a/version.py b/version.py
index 59b0d76..b9acc98 100644
--- a/version.py
+++ b/version.py
@@ -1,4 +1,4 @@
-import urllib
+import urllib.request, urllib.parse, urllib.error
import re
import time
try:
@@ -67,31 +67,31 @@ def lexVersion(short=False):
# Naughty I know, but it lets me grab it from the bash script.
if __name__ == "__main__":
- print lexVersion()
+ print(lexVersion())
def verStrToNum(ver):
w = re.match("(\d+\.?\d+)\.(\d+)-?([A-Za-z]{0,2})\.?(\d*):(\S+)", ver)
if not w:
- print "Update check Failure: 3"; return
+ print("Update check Failure: 3"); return
full = ver[:ver.find(":")]
return full,w.group(1),w.group(2),w.group(3),w.group(4),w.group(5)
def updateCheck(q):
time.sleep(3)
- data = urllib.urlencode({"type" : USER_TYPE, "os" : OS_TYPE, "install" : INSTALL_TYPE})
+ data = urllib.parse.urlencode({"type" : USER_TYPE, "os" : OS_TYPE, "install" : INSTALL_TYPE})
try:
- f = urllib.urlopen("http://distantsphere.com/pesterchum.php?" + data)
+ f = urllib.request.urlopen("http://distantsphere.com/pesterchum.php?" + data)
except:
- print "Update check Failure: 1"; return q.put((False,1))
+ print("Update check Failure: 1"); return q.put((False,1))
newest = f.read()
f.close()
if not newest or newest[0] == "<":
- print "Update check Failure: 2"; return q.put((False,2))
+ print("Update check Failure: 2"); return q.put((False,2))
try:
(full, major, minor, status, revision, url) = verStrToNum(newest)
except TypeError:
return q.put((False,3))
- print full
+ print(full)
if major <= _pcMajor:
if minor <= _pcMinor:
if status:
@@ -102,7 +102,7 @@ def updateCheck(q):
if not _pcStatus:
if revision <= _pcRevision:
return q.put((False,0))
- print "A new version of Pesterchum is avaliable!"
+ print("A new version of Pesterchum is avaliable!")
q.put((full,url))
@@ -128,9 +128,9 @@ def copyUpdate(path):
def updateExtract(url, extension):
if extension:
fn = "update" + extension
- urllib.urlretrieve(url, fn)
+ urllib.request.urlretrieve(url, fn)
else:
- fn = urllib.urlretrieve(url)[0]
+ fn = urllib.request.urlretrieve(url)[0]
if tarfile and tarfile.is_tarfile(fn):
extension = ".tar.gz"
elif zipfile.is_zipfile(fn):
@@ -144,17 +144,17 @@ def updateExtract(url, extension):
except:
pass
- print url, fn, extension
+ print(url, fn, extension)
if extension == ".exe":
pass
elif extension == ".zip" or extension.startswith(".tar"):
if extension == ".zip":
from zipfile import is_zipfile as is_updatefile, ZipFile as openupdate
- print "Opening .zip"
+ print("Opening .zip")
elif tarfile and extension.startswith(".tar"):
from tarfile import is_tarfile as is_updatefile, open as openupdate
- print "Opening .tar"
+ print("Opening .tar")
else:
return