- if pieces and (pieces[0] == '<div>' or pieces[0].startswith('<div ')) and pieces[-1]=='</div>':
- depth = 0
- for piece in pieces[:-1]:
- if piece.startswith('</'):
- depth -= 1
- if depth == 0:
- break
- elif piece.startswith('<') and not piece.endswith('/>'):
- depth += 1
- else:
- pieces = pieces[1:-1]
-
- # Ensure each piece is a str for Python 3
- for (i, v) in enumerate(pieces):
- if not isinstance(v, str):
- pieces[i] = v.decode('utf-8')
-
output = ''.join(pieces)
if stripWhitespace:
output = output.strip()
- if not expectingText:
- return output
+ if not expectingText: return output
# decode base64 content
if base64 and self.contentparams.get('base64', 0):
try:
- output = _base64decode(output)
+ output = base64.decodestring(output)
except binascii.Error:
pass
except binascii.Incomplete:
pass
- except TypeError:
- # In Python 3, base64 takes and outputs bytes, not str
- # This may not be the most correct way to accomplish this
- output = _base64decode(output.encode('utf-8')).decode('utf-8')
-
+
# resolve relative URIs
if (element in self.can_be_relative_uri) and output:
output = self.resolveURI(output)
-
+
# decode entities within embedded markup
if not self.contentparams.get('base64', 0):
output = self.decodeEntities(element, output)
- # some feed formats require consumers to guess
- # whether the content is html or plain text
- if not self.version.startswith('atom') and self.contentparams.get('type') == 'text/plain':
- if self.lookslikehtml(output):
- self.contentparams['type'] = 'text/html'
-
# remove temporary cruft from contentparams
try:
del self.contentparams['mode']
@@ -914,55 +635,26 @@ class _FeedParserMixin:
except KeyError:
pass
- is_htmlish = self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types
# resolve relative URIs within embedded markup
- if is_htmlish and RESOLVE_RELATIVE_URIS:
+ if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types:
if element in self.can_contain_relative_uris:
- output = _resolveRelativeURIs(output, self.baseuri, self.encoding, self.contentparams.get('type', 'text/html'))
-
- # parse microformats
- # (must do this before sanitizing because some microformats
- # rely on elements that we sanitize)
- if PARSE_MICROFORMATS and is_htmlish and element in ['content', 'description', 'summary']:
- mfresults = _parseMicroformats(output, self.baseuri, self.encoding)
- if mfresults:
- for tag in mfresults.get('tags', []):
- self._addTag(tag['term'], tag['scheme'], tag['label'])
- for enclosure in mfresults.get('enclosures', []):
- self._start_enclosure(enclosure)
- for xfn in mfresults.get('xfn', []):
- self._addXFN(xfn['relationships'], xfn['href'], xfn['name'])
- vcard = mfresults.get('vcard')
- if vcard:
- self._getContext()['vcard'] = vcard
-
+ output = _resolveRelativeURIs(output, self.baseuri, self.encoding)
+
# sanitize embedded markup
- if is_htmlish and SANITIZE_HTML:
+ if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types:
if element in self.can_contain_dangerous_markup:
- output = _sanitizeHTML(output, self.encoding, self.contentparams.get('type', 'text/html'))
+ output = _sanitizeHTML(output, self.encoding)
- if self.encoding and not isinstance(output, str):
- output = output.decode(self.encoding, 'ignore')
-
- # address common error where people take data that is already
- # utf-8, presume that it is iso-8859-1, and re-encode it.
- if self.encoding in ('utf-8', 'utf-8_INVALID_PYTHON_3') and isinstance(output, str):
+ if self.encoding and type(output) != type(u''):
try:
- output = output.encode('iso-8859-1').decode('utf-8')
- except (UnicodeEncodeError, UnicodeDecodeError):
+ output = unicode(output, self.encoding)
+ except:
pass
- # map win-1252 extensions to the proper code points
- if isinstance(output, str):
- output = output.translate(_cp1252)
-
# categories/tags/keywords/whatever are handled in _end_category
if element == 'category':
return output
-
- if element == 'title' and -1 < self.title_depth <= self.depth:
- return output
-
+
# store output in appropriate place(s)
if self.inentry and not self.insource:
if element == 'content':
@@ -971,34 +663,23 @@ class _FeedParserMixin:
contentparams['value'] = output
self.entries[-1][element].append(contentparams)
elif element == 'link':
- if not self.inimage:
- # query variables in urls in link elements are improperly
- # converted from `?a=1&b=2` to `?a=1&amp;b;=2` as if they're
- # unhandled character references. fix this special case.
- output = re.sub("&([A-Za-z0-9_]+);", "&\g<1>", output)
- self.entries[-1][element] = output
- if output:
- self.entries[-1]['links'][-1]['href'] = output
+ self.entries[-1][element] = output
+ if output:
+ self.entries[-1]['links'][-1]['href'] = output
else:
if element == 'description':
element = 'summary'
- old_value_depth = self.property_depth_map.setdefault(self.entries[-1], {}).get(element)
- if old_value_depth is None or self.depth <= old_value_depth:
- self.property_depth_map[self.entries[-1]][element] = self.depth
- self.entries[-1][element] = output
+ self.entries[-1][element] = output
if self.incontent:
contentparams = copy.deepcopy(self.contentparams)
contentparams['value'] = output
self.entries[-1][element + '_detail'] = contentparams
- elif (self.infeed or self.insource):# and (not self.intextinput) and (not self.inimage):
+ elif (self.infeed or self.insource) and (not self.intextinput) and (not self.inimage):
context = self._getContext()
if element == 'description':
element = 'subtitle'
context[element] = output
if element == 'link':
- # fix query variables; see above for the explanation
- output = re.sub("&([A-Za-z0-9_]+);", "&\g<1>", output)
- context[element] = output
context['links'][-1]['href'] = output
elif self.incontent:
contentparams = copy.deepcopy(self.contentparams)
@@ -1008,8 +689,6 @@ class _FeedParserMixin:
def pushContent(self, tag, attrsD, defaultContentType, expectingText):
self.incontent += 1
- if self.lang:
- self.lang=self.lang.replace('_','-')
self.contentparams = FeedParserDict({
'type': self.mapContentType(attrsD.get('type', defaultContentType)),
'language': self.lang,
@@ -1022,36 +701,16 @@ class _FeedParserMixin:
self.incontent -= 1
self.contentparams.clear()
return value
-
- # a number of elements in a number of RSS variants are nominally plain
- # text, but this is routinely ignored. This is an attempt to detect
- # the most common cases. As false positives often result in silent
- # data loss, this function errs on the conservative side.
- @staticmethod
- def lookslikehtml(s):
- # must have a close tag or an entity reference to qualify
- if not (re.search(r'</(\w+)>',s) or re.search("&#?\w+;",s)):
- return
-
- # all tags must be in a restricted subset of valid HTML tags
- if [t for t in re.findall(r'</?(\w+)',s) if t.lower() not in _HTMLSanitizer.acceptable_elements]:
- return
-
- # all entities must have been defined as valid HTML entities
- if [e for e in re.findall(r'&(\w+);', s) if e not in list(entitydefs.keys())]:
- return
-
- return 1
-
+
def _mapToStandardPrefix(self, name):
colonpos = name.find(':')
- if colonpos != -1:
+ if colonpos <> -1:
prefix = name[:colonpos]
suffix = name[colonpos+1:]
prefix = self.namespacemap.get(prefix, prefix)
name = prefix + ':' + suffix
return name
-
+
def _getAttribute(self, attrsD, name):
return attrsD.get(self._mapToStandardPrefix(name))
@@ -1079,23 +738,17 @@ class _FeedParserMixin:
pass
attrsD['href'] = href
return attrsD
-
- def _save(self, key, value, overwrite=False):
+
+ def _save(self, key, value):
context = self._getContext()
- if overwrite:
- context[key] = value
- else:
- context.setdefault(key, value)
+ context.setdefault(key, value)
def _start_rss(self, attrsD):
versionmap = {'0.91': 'rss091u',
'0.92': 'rss092',
'0.93': 'rss093',
'0.94': 'rss094'}
- #If we're here then this is an RSS feed.
- #If we don't have a version or have a version that starts with something
- #other than RSS then there's been a mistake. Correct it.
- if not self.version or not self.version.startswith('rss'):
+ if not self.version:
attr_version = attrsD.get('version', '')
version = versionmap.get(attr_version)
if version:
@@ -1104,21 +757,25 @@ class _FeedParserMixin:
self.version = 'rss20'
else:
self.version = 'rss'
+
+ def _start_dlhottitles(self, attrsD):
+ self.version = 'hotrss'
def _start_channel(self, attrsD):
self.infeed = 1
self._cdf_common(attrsD)
+ _start_feedinfo = _start_channel
def _cdf_common(self, attrsD):
- if 'lastmod' in attrsD:
+ if attrsD.has_key('lastmod'):
self._start_modified({})
self.elementstack[-1][-1] = attrsD['lastmod']
self._end_modified()
- if 'href' in attrsD:
+ if attrsD.has_key('href'):
self._start_link({})
self.elementstack[-1][-1] = attrsD['href']
self._end_link()
-
+
def _start_feed(self, attrsD):
self.infeed = 1
versionmap = {'0.1': 'atom01',
@@ -1135,27 +792,24 @@ class _FeedParserMixin:
def _end_channel(self):
self.infeed = 0
_end_feed = _end_channel
-
+
def _start_image(self, attrsD):
- context = self._getContext()
- if not self.inentry:
- context.setdefault('image', FeedParserDict())
self.inimage = 1
- self.title_depth = -1
self.push('image', 0)
-
+ context = self._getContext()
+ context.setdefault('image', FeedParserDict())
+
def _end_image(self):
self.pop('image')
self.inimage = 0
def _start_textinput(self, attrsD):
+ self.intextinput = 1
+ self.push('textinput', 0)
context = self._getContext()
context.setdefault('textinput', FeedParserDict())
- self.intextinput = 1
- self.title_depth = -1
- self.push('textinput', 0)
_start_textInput = _start_textinput
-
+
def _end_textinput(self):
self.pop('textinput')
self.intextinput = 0
@@ -1164,10 +818,6 @@ class _FeedParserMixin:
def _start_author(self, attrsD):
self.inauthor = 1
self.push('author', 1)
- # Append a new FeedParserDict when expecting an author
- context = self._getContext()
- context.setdefault('authors', [])
- context['authors'].append(FeedParserDict())
_start_managingeditor = _start_author
_start_dc_author = _start_author
_start_dc_creator = _start_author
@@ -1227,7 +877,7 @@ class _FeedParserMixin:
self._save_contributor('name', value)
elif self.intextinput:
context = self._getContext()
- context['name'] = value
+ context['textinput']['name'] = value
_end_itunes_name = _end_name
def _start_width(self, attrsD):
@@ -1237,11 +887,11 @@ class _FeedParserMixin:
value = self.pop('width')
try:
value = int(value)
- except ValueError:
+ except:
value = 0
if self.inimage:
context = self._getContext()
- context['width'] = value
+ context['image']['width'] = value
def _start_height(self, attrsD):
self.push('height', 0)
@@ -1250,11 +900,11 @@ class _FeedParserMixin:
value = self.pop('height')
try:
value = int(value)
- except ValueError:
+ except:
value = 0
if self.inimage:
context = self._getContext()
- context['height'] = value
+ context['image']['height'] = value
def _start_url(self, attrsD):
self.push('href', 1)
@@ -1267,6 +917,12 @@ class _FeedParserMixin:
self._save_author('href', value)
elif self.incontributor:
self._save_contributor('href', value)
+ elif self.inimage:
+ context = self._getContext()
+ context['image']['href'] = value
+ elif self.intextinput:
+ context = self._getContext()
+ context['textinput']['link'] = value
_end_homepage = _end_url
_end_uri = _end_url
@@ -1287,10 +943,6 @@ class _FeedParserMixin:
def _getContext(self):
if self.insource:
context = self.sourcedata
- elif self.inimage and 'image' in self.feeddata:
- context = self.feeddata['image']
- elif self.intextinput:
- context = self.feeddata['textinput']
elif self.inentry:
context = self.entries[-1]
else:
@@ -1302,8 +954,6 @@ class _FeedParserMixin:
context.setdefault(prefix + '_detail', FeedParserDict())
context[prefix + '_detail'][key] = value
self._sync_author_detail()
- context.setdefault('authors', [FeedParserDict()])
- context['authors'][-1][key] = value
def _save_contributor(self, key, value):
context = self._getContext()
@@ -1323,29 +973,23 @@ class _FeedParserMixin:
elif email:
context[key] = email
else:
- author, email = context.get(key), None
- if not author:
- return
- emailmatch = re.search(r'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))(\?subject=\S+)?''', author)
- if emailmatch:
- email = emailmatch.group(0)
- # probably a better way to do the following, but it passes all the tests
- author = author.replace(email, '')
- author = author.replace('()', '')
- author = author.replace('<>', '')
- author = author.replace('&lt;&gt;', '')
- author = author.strip()
- if author and (author[0] == '('):
- author = author[1:]
- if author and (author[-1] == ')'):
- author = author[:-1]
- author = author.strip()
- if author or email:
- context.setdefault('%s_detail' % key, FeedParserDict())
- if author:
- context['%s_detail' % key]['name'] = author
- if email:
- context['%s_detail' % key]['email'] = email
+ author = context.get(key)
+ if not author: return
+ emailmatch = re.search(r'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))''', author)
+ if not emailmatch: return
+ email = emailmatch.group(0)
+ # probably a better way to do the following, but it passes all the tests
+ author = author.replace(email, '')
+ author = author.replace('()', '')
+ author = author.strip()
+ if author and (author[0] == '('):
+ author = author[1:]
+ if author and (author[-1] == ')'):
+ author = author[:-1]
+ author = author.strip()
+ context.setdefault('%s_detail' % key, FeedParserDict())
+ context['%s_detail' % key]['name'] = author
+ context['%s_detail' % key]['email'] = email
def _start_subtitle(self, attrsD):
self.pushContent('subtitle', attrsD, 'text/plain', 1)
@@ -1356,7 +1000,7 @@ class _FeedParserMixin:
self.popContent('subtitle')
_end_tagline = _end_subtitle
_end_itunes_subtitle = _end_subtitle
-
+
def _start_rights(self, attrsD):
self.pushContent('rights', attrsD, 'text/plain', 1)
_start_dc_rights = _start_rights
@@ -1372,13 +1016,13 @@ class _FeedParserMixin:
self.push('item', 0)
self.inentry = 1
self.guidislink = 0
- self.title_depth = -1
id = self._getAttribute(attrsD, 'rdf:about')
if id:
context = self._getContext()
context['id'] = id
self._cdf_common(attrsD)
_start_entry = _start_item
+ _start_product = _start_item
def _end_item(self):
self.pop('item')
@@ -1406,30 +1050,28 @@ class _FeedParserMixin:
self.push('published', 1)
_start_dcterms_issued = _start_published
_start_issued = _start_published
- _start_pubdate = _start_published
def _end_published(self):
value = self.pop('published')
- self._save('published_parsed', _parse_date(value), overwrite=True)
+ self._save('published_parsed', _parse_date(value))
_end_dcterms_issued = _end_published
_end_issued = _end_published
- _end_pubdate = _end_published
def _start_updated(self, attrsD):
self.push('updated', 1)
_start_modified = _start_updated
_start_dcterms_modified = _start_updated
+ _start_pubdate = _start_updated
_start_dc_date = _start_updated
- _start_lastbuilddate = _start_updated
def _end_updated(self):
value = self.pop('updated')
parsed_value = _parse_date(value)
- self._save('updated_parsed', parsed_value, overwrite=True)
+ self._save('updated_parsed', parsed_value)
_end_modified = _end_updated
_end_dcterms_modified = _end_updated
+ _end_pubdate = _end_updated
_end_dc_date = _end_updated
- _end_lastbuilddate = _end_updated
def _start_created(self, attrsD):
self.push('created', 1)
@@ -1437,56 +1079,38 @@ class _FeedParserMixin:
def _end_created(self):
value = self.pop('created')
- self._save('created_parsed', _parse_date(value), overwrite=True)
+ self._save('created_parsed', _parse_date(value))
_end_dcterms_created = _end_created
def _start_expirationdate(self, attrsD):
self.push('expired', 1)
def _end_expirationdate(self):
- self._save('expired_parsed', _parse_date(self.pop('expired')), overwrite=True)
+ self._save('expired_parsed', _parse_date(self.pop('expired')))
def _start_cc_license(self, attrsD):
- context = self._getContext()
+ self.push('license', 1)
value = self._getAttribute(attrsD, 'rdf:resource')
- attrsD = FeedParserDict()
- attrsD['rel'] = 'license'
if value:
- attrsD['href']=value
- context.setdefault('links', []).append(attrsD)
-
+ self.elementstack[-1][2].append(value)
+ self.pop('license')
+
def _start_creativecommons_license(self, attrsD):
self.push('license', 1)
- _start_creativeCommons_license = _start_creativecommons_license
def _end_creativecommons_license(self):
- value = self.pop('license')
- context = self._getContext()
- attrsD = FeedParserDict()
- attrsD['rel'] = 'license'
- if value:
- attrsD['href'] = value
- context.setdefault('links', []).append(attrsD)
- del context['license']
- _end_creativeCommons_license = _end_creativecommons_license
-
- def _addXFN(self, relationships, href, name):
- context = self._getContext()
- xfn = context.setdefault('xfn', [])
- value = FeedParserDict({'relationships': relationships, 'href': href, 'name': name})
- if value not in xfn:
- xfn.append(value)
+ self.pop('license')
def _addTag(self, term, scheme, label):
context = self._getContext()
tags = context.setdefault('tags', [])
- if (not term) and (not scheme) and (not label):
- return
+ if (not term) and (not scheme) and (not label): return
value = FeedParserDict({'term': term, 'scheme': scheme, 'label': label})
if value not in tags:
- tags.append(value)
+ tags.append(FeedParserDict({'term': term, 'scheme': scheme, 'label': label}))
def _start_category(self, attrsD):
+ if _debug: sys.stderr.write('entering _start_category with %s\n' % repr(attrsD))
term = attrsD.get('term')
scheme = attrsD.get('scheme', attrsD.get('domain'))
label = attrsD.get('label')
@@ -1494,24 +1118,18 @@ class _FeedParserMixin:
self.push('category', 1)
_start_dc_subject = _start_category
_start_keywords = _start_category
-
- def _start_media_category(self, attrsD):
- attrsD.setdefault('scheme', 'http://search.yahoo.com/mrss/category_schema')
- self._start_category(attrsD)
-
+
def _end_itunes_keywords(self):
- for term in self.pop('itunes_keywords').split(','):
- if term.strip():
- self._addTag(term.strip(), 'http://www.itunes.com/', None)
-
+ for term in self.pop('itunes_keywords').split():
+ self._addTag(term, 'http://www.itunes.com/', None)
+
def _start_itunes_category(self, attrsD):
self._addTag(attrsD.get('text'), 'http://www.itunes.com/', None)
self.push('category', 1)
-
+
def _end_category(self):
value = self.pop('category')
- if not value:
- return
+ if not value: return
context = self._getContext()
tags = context['tags']
if value and len(tags) and not tags[-1]['term']:
@@ -1521,78 +1139,73 @@ class _FeedParserMixin:
_end_dc_subject = _end_category
_end_keywords = _end_category
_end_itunes_category = _end_category
- _end_media_category = _end_category
def _start_cloud(self, attrsD):
self._getContext()['cloud'] = FeedParserDict(attrsD)
-
+
def _start_link(self, attrsD):
attrsD.setdefault('rel', 'alternate')
- if attrsD['rel'] == 'self':
- attrsD.setdefault('type', 'application/atom+xml')
- else:
- attrsD.setdefault('type', 'text/html')
- context = self._getContext()
+ attrsD.setdefault('type', 'text/html')
attrsD = self._itsAnHrefDamnIt(attrsD)
- if 'href' in attrsD:
+ if attrsD.has_key('href'):
attrsD['href'] = self.resolveURI(attrsD['href'])
expectingText = self.infeed or self.inentry or self.insource
+ context = self._getContext()
context.setdefault('links', [])
- if not (self.inentry and self.inimage):
- context['links'].append(FeedParserDict(attrsD))
- if 'href' in attrsD:
+ context['links'].append(FeedParserDict(attrsD))
+ if attrsD['rel'] == 'enclosure':
+ self._start_enclosure(attrsD)
+ if attrsD.has_key('href'):
expectingText = 0
if (attrsD.get('rel') == 'alternate') and (self.mapContentType(attrsD.get('type')) in self.html_types):
context['link'] = attrsD['href']
else:
self.push('link', expectingText)
+ _start_producturl = _start_link
def _end_link(self):
value = self.pop('link')
+ context = self._getContext()
+ if self.intextinput:
+ context['textinput']['link'] = value
+ if self.inimage:
+ context['image']['link'] = value
+ _end_producturl = _end_link
def _start_guid(self, attrsD):
self.guidislink = (attrsD.get('ispermalink', 'true') == 'true')
self.push('id', 1)
- _start_id = _start_guid
def _end_guid(self):
value = self.pop('id')
- self._save('guidislink', self.guidislink and 'link' not in self._getContext())
+ self._save('guidislink', self.guidislink and not self._getContext().has_key('link'))
if self.guidislink:
# guid acts as link, but only if 'ispermalink' is not present or is 'true',
# and only if the item doesn't already have a link element
self._save('link', value)
- _end_id = _end_guid
def _start_title(self, attrsD):
- if self.svgOK:
- return self.unknown_starttag('title', list(attrsD.items()))
self.pushContent('title', attrsD, 'text/plain', self.infeed or self.inentry or self.insource)
_start_dc_title = _start_title
_start_media_title = _start_title
def _end_title(self):
- if self.svgOK:
- return
value = self.popContent('title')
- if not value:
- return
- self.title_depth = self.depth
+ context = self._getContext()
+ if self.intextinput:
+ context['textinput']['title'] = value
+ elif self.inimage:
+ context['image']['title'] = value
_end_dc_title = _end_title
-
- def _end_media_title(self):
- title_depth = self.title_depth
- self._end_title()
- self.title_depth = title_depth
+ _end_media_title = _end_title
def _start_description(self, attrsD):
context = self._getContext()
- if 'summary' in context:
+ if context.has_key('summary'):
self._summaryKey = 'content'
self._start_content(attrsD)
else:
self.pushContent('description', attrsD, 'text/html', self.infeed or self.inentry or self.insource)
- _start_dc_description = _start_description
def _start_abstract(self, attrsD):
self.pushContent('description', attrsD, 'text/plain', self.infeed or self.inentry or self.insource)
@@ -1602,9 +1215,13 @@ class _FeedParserMixin:
self._end_content()
else:
value = self.popContent('description')
+ context = self._getContext()
+ if self.intextinput:
+ context['textinput']['description'] = value
+ elif self.inimage:
+ context['image']['description'] = value
self._summaryKey = None
_end_abstract = _end_description
- _end_dc_description = _end_description
def _start_info(self, attrsD):
self.pushContent('info', attrsD, 'text/plain', 1)
@@ -1617,7 +1234,7 @@ class _FeedParserMixin:
def _start_generator(self, attrsD):
if attrsD:
attrsD = self._itsAnHrefDamnIt(attrsD)
- if 'href' in attrsD:
+ if attrsD.has_key('href'):
attrsD['href'] = self.resolveURI(attrsD['href'])
self._getContext()['generator_detail'] = FeedParserDict(attrsD)
self.push('generator', 1)
@@ -1625,9 +1242,9 @@ class _FeedParserMixin:
def _end_generator(self):
value = self.pop('generator')
context = self._getContext()
- if 'generator_detail' in context:
+ if context.has_key('generator_detail'):
context['generator_detail']['name'] = value
-
+
def _start_admin_generatoragent(self, attrsD):
self.push('generator', 1)
value = self._getAttribute(attrsD, 'rdf:resource')
@@ -1642,10 +1259,10 @@ class _FeedParserMixin:
if value:
self.elementstack[-1][2].append(value)
self.pop('errorreportsto')
-
+
def _start_summary(self, attrsD):
context = self._getContext()
- if 'summary' in context:
+ if context.has_key('summary'):
self._summaryKey = 'content'
self._start_content(attrsD)
else:
@@ -1660,26 +1277,21 @@ class _FeedParserMixin:
self.popContent(self._summaryKey or 'summary')
self._summaryKey = None
_end_itunes_summary = _end_summary
-
+
def _start_enclosure(self, attrsD):
attrsD = self._itsAnHrefDamnIt(attrsD)
- context = self._getContext()
- attrsD['rel'] = 'enclosure'
- context.setdefault('links', []).append(FeedParserDict(attrsD))
-
+ self._getContext().setdefault('enclosures', []).append(FeedParserDict(attrsD))
+ href = attrsD.get('href')
+ if href:
+ context = self._getContext()
+ if not context.get('id'):
+ context['id'] = href
+
def _start_source(self, attrsD):
- if 'url' in attrsD:
- # This means that we're processing a source element from an RSS 2.0 feed
- self.sourcedata['href'] = attrsD['url']
- self.push('source', 1)
self.insource = 1
- self.title_depth = -1
def _end_source(self):
self.insource = 0
- value = self.pop('source')
- if value:
- self.sourcedata['title'] = value
self._getContext()['source'] = copy.deepcopy(self.sourcedata)
self.sourcedata.clear()
@@ -1690,6 +1302,9 @@ class _FeedParserMixin:
self.contentparams['src'] = src
self.push('content', 1)
+ def _start_prodlink(self, attrsD):
+ self.pushContent('content', attrsD, 'text/html', 1)
+
def _start_body(self, attrsD):
self.pushContent('content', attrsD, 'application/xhtml+xml', 1)
_start_xhtml_body = _start_body
@@ -1699,95 +1314,45 @@ class _FeedParserMixin:
_start_fullitem = _start_content_encoded
def _end_content(self):
- copyToSummary = self.mapContentType(self.contentparams.get('type')) in (['text/plain'] + self.html_types)
+ copyToDescription = self.mapContentType(self.contentparams.get('type')) in (['text/plain'] + self.html_types)
value = self.popContent('content')
- if copyToSummary:
- self._save('summary', value)
-
+ if copyToDescription:
+ self._save('description', value)
_end_body = _end_content
_end_xhtml_body = _end_content
_end_content_encoded = _end_content
_end_fullitem = _end_content
+ _end_prodlink = _end_content
def _start_itunes_image(self, attrsD):
self.push('itunes_image', 0)
- if attrsD.get('href'):
- self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')})
- elif attrsD.get('url'):
- self._getContext()['image'] = FeedParserDict({'href': attrsD.get('url')})
+ self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')})
_start_itunes_link = _start_itunes_image
-
+
def _end_itunes_block(self):
value = self.pop('itunes_block', 0)
self._getContext()['itunes_block'] = (value == 'yes') and 1 or 0
def _end_itunes_explicit(self):
value = self.pop('itunes_explicit', 0)
- # Convert 'yes' -> True, 'clean' to False, and any other value to None
- # False and None both evaluate as False, so the difference can be ignored
- # by applications that only need to know if the content is explicit.
- self._getContext()['itunes_explicit'] = (None, False, True)[(value == 'yes' and 2) or value == 'clean' or 0]
-
- def _start_media_content(self, attrsD):
- context = self._getContext()
- context.setdefault('media_content', [])
- context['media_content'].append(attrsD)
-
- def _start_media_thumbnail(self, attrsD):
- context = self._getContext()
- context.setdefault('media_thumbnail', [])
- self.push('url', 1) # new
- context['media_thumbnail'].append(attrsD)
-
- def _end_media_thumbnail(self):
- url = self.pop('url')
- context = self._getContext()
- if url != None and len(url.strip()) != 0:
- if 'url' not in context['media_thumbnail'][-1]:
- context['media_thumbnail'][-1]['url'] = url
-
- def _start_media_player(self, attrsD):
- self.push('media_player', 0)
- self._getContext()['media_player'] = FeedParserDict(attrsD)
-
- def _end_media_player(self):
- value = self.pop('media_player')
- context = self._getContext()
- context['media_player']['content'] = value
-
- def _start_newlocation(self, attrsD):
- self.push('newlocation', 1)
-
- def _end_newlocation(self):
- url = self.pop('newlocation')
- context = self._getContext()
- # don't set newlocation if the context isn't right
- if context is not self.feeddata:
- return
- context['newlocation'] = _makeSafeAbsoluteURI(self.baseuri, url.strip())
+ self._getContext()['itunes_explicit'] = (value == 'yes') and 1 or 0
if _XML_AVAILABLE:
class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler):
def __init__(self, baseuri, baselang, encoding):
+ if _debug: sys.stderr.write('trying StrictFeedParser\n')
xml.sax.handler.ContentHandler.__init__(self)
_FeedParserMixin.__init__(self, baseuri, baselang, encoding)
self.bozo = 0
self.exc = None
- self.decls = {}
-
+
def startPrefixMapping(self, prefix, uri):
- if not uri:
- return
- # Jython uses '' instead of None; standardize on None
- prefix = prefix or None
self.trackNamespace(prefix, uri)
- if prefix and uri == 'http://www.w3.org/1999/xlink':
- self.decls['xmlns:' + prefix] = uri
-
+
def startElementNS(self, name, qname, attrs):
namespace, localname = name
lowernamespace = str(namespace or '').lower()
- if lowernamespace.find('backend.userland.com/rss') != -1:
+ if lowernamespace.find('backend.userland.com/rss') <> -1:
# match any backend.userland.com namespace
namespace = 'http://backend.userland.com/rss'
lowernamespace = namespace
@@ -1796,9 +1361,12 @@ if _XML_AVAILABLE:
else:
givenprefix = None
prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
- if givenprefix and (prefix == None or (prefix == '' and lowernamespace == '')) and givenprefix not in self.namespacesInUse:
- raise UndeclaredNamespace("'%s' is not associated with a namespace" % givenprefix)
+ if givenprefix and (prefix == None or (prefix == '' and lowernamespace == '')) and not self.namespacesInUse.has_key(givenprefix):
+ raise UndeclaredNamespace, "'%s' is not associated with a namespace" % givenprefix
+ if prefix:
+ localname = prefix + ':' + localname
localname = str(localname).lower()
+ if _debug: sys.stderr.write('startElementNS: qname = %s, namespace = %s, givenprefix = %s, prefix = %s, attrs = %s, localname = %s\n' % (qname, namespace, givenprefix, prefix, attrs.items(), localname))
# qname implementation is horribly broken in Python 2.1 (it
# doesn't report any), and slightly broken in Python 2.2 (it
@@ -1807,21 +1375,8 @@ if _XML_AVAILABLE:
# the qnames the SAX parser gives us (if indeed it gives us any
# at all). Thanks to MatejC for helping me test this and
# tirelessly telling me that it didn't work yet.
- attrsD, self.decls = self.decls, {}
- if localname=='math' and namespace=='http://www.w3.org/1998/Math/MathML':
- attrsD['xmlns']=namespace
- if localname=='svg' and namespace=='http://www.w3.org/2000/svg':
- attrsD['xmlns']=namespace
-
- if prefix:
- localname = prefix.lower() + ':' + localname
- elif namespace and not qname: #Expat
- for name,value in list(self.namespacesInUse.items()):
- if name and value == namespace:
- localname = name + ':' + localname
- break
-
- for (namespace, attrlocalname), attrvalue in list(attrs.items()):
+ attrsD = {}
+ for (namespace, attrlocalname), attrvalue in attrs._attrs.items():
lowernamespace = (namespace or '').lower()
prefix = self._matchnamespaces.get(lowernamespace, '')
if prefix:
@@ -1829,7 +1384,7 @@ if _XML_AVAILABLE:
attrsD[str(attrlocalname).lower()] = attrvalue
for qname in attrs.getQNames():
attrsD[str(qname).lower()] = attrs.getValueByQName(qname)
- self.unknown_starttag(localname, list(attrsD.items()))
+ self.unknown_starttag(localname, attrsD.items())
def characters(self, text):
self.handle_data(text)
@@ -1844,39 +1399,26 @@ if _XML_AVAILABLE:
prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
if prefix:
localname = prefix + ':' + localname
- elif namespace and not qname: #Expat
- for name,value in list(self.namespacesInUse.items()):
- if name and value == namespace:
- localname = name + ':' + localname
- break
localname = str(localname).lower()
self.unknown_endtag(localname)
def error(self, exc):
self.bozo = 1
self.exc = exc
-
- # drv_libxml2 calls warning() in some cases
- warning = error
-
+
def fatalError(self, exc):
self.error(exc)
raise exc
class _BaseHTMLProcessor(sgmllib.SGMLParser):
- special = re.compile('''[<>'"]''')
- bare_ampersand = re.compile("&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)")
- elements_no_end_tag = set([
- 'area', 'base', 'basefont', 'br', 'col', 'command', 'embed', 'frame',
- 'hr', 'img', 'input', 'isindex', 'keygen', 'link', 'meta', 'param',
- 'source', 'track', 'wbr'
- ])
-
- def __init__(self, encoding, _type):
+ elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
+ 'img', 'input', 'isindex', 'link', 'meta', 'param']
+
+ def __init__(self, encoding):
self.encoding = encoding
- self._type = _type
+ if _debug: sys.stderr.write('entering BaseHTMLProcessor, encoding=%s\n' % self.encoding)
sgmllib.SGMLParser.__init__(self)
-
+
def reset(self):
self.pieces = []
sgmllib.SGMLParser.reset(self)
@@ -1887,132 +1429,80 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser):
return '<' + tag + ' />'
else:
return '<' + tag + '>' + tag + '>'
-
- # By declaring these methods and overriding their compiled code
- # with the code from sgmllib, the original code will execute in
- # feedparser's scope instead of sgmllib's. This means that the
- # `tagfind` and `charref` regular expressions will be found as
- # they're declared above, not as they're declared in sgmllib.
- def goahead(self, i):
- pass
- goahead.__code__ = sgmllib.SGMLParser.goahead.__code__
-
- def __parse_starttag(self, i):
- pass
- __parse_starttag.__code__ = sgmllib.SGMLParser.parse_starttag.__code__
-
- def parse_starttag(self,i):
- j = self.__parse_starttag(i)
- if self._type == 'application/xhtml+xml':
- if j>2 and self.rawdata[j-2:j]=='/>':
- self.unknown_endtag(self.lasttag)
- return j
-
+
def feed(self, data):
data = re.compile(r'\s]+?)\s*/>', self._shorttag_replace, data)
+ #data = re.sub(r'<(\S+?)\s*?/>', self._shorttag_replace, data) # bug [ 1399464 ] Bad regexp for _shorttag_replace
+ data = re.sub(r'<([^<\s]+?)\s*/>', self._shorttag_replace, data)
data = data.replace(''', "'")
data = data.replace('"', '"')
- try:
- bytes
- if bytes is str:
- raise NameError
- self.encoding = self.encoding + '_INVALID_PYTHON_3'
- except NameError:
- if self.encoding and isinstance(data, str):
- data = data.encode(self.encoding)
+ if self.encoding and type(data) == type(u''):
+ data = data.encode(self.encoding)
sgmllib.SGMLParser.feed(self, data)
- sgmllib.SGMLParser.close(self)
def normalize_attrs(self, attrs):
- if not attrs:
- return attrs
# utility method to be called by descendants
- attrs = list(dict([(k.lower(), v) for k, v in attrs]).items())
+ attrs = [(k.lower(), v) for k, v in attrs]
attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
- attrs.sort()
return attrs
def unknown_starttag(self, tag, attrs):
# called for each start tag
# attrs is a list of (attr, value) tuples
# e.g. for
, tag='pre', attrs=[('class', 'screen')]
+ if _debug: sys.stderr.write('_BaseHTMLProcessor, unknown_starttag, tag=%s\n' % tag)
uattrs = []
- strattrs=''
- if attrs:
- for key, value in attrs:
- value=value.replace('>','>').replace('<','<').replace('"','"')
- value = self.bare_ampersand.sub("&", value)
- # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds
- if not isinstance(value, str):
- value = value.decode(self.encoding, 'ignore')
- try:
- # Currently, in Python 3 the key is already a str, and cannot be decoded again
- uattrs.append((str(key, self.encoding), value))
- except TypeError:
- uattrs.append((key, value))
- strattrs = ''.join([' %s="%s"' % (key, value) for key, value in uattrs])
- if self.encoding:
- try:
- strattrs = strattrs.encode(self.encoding)
- except (UnicodeEncodeError, LookupError):
- pass
+ # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds
+ for key, value in attrs:
+ if type(value) != type(u''):
+ value = unicode(value, self.encoding)
+ uattrs.append((unicode(key, self.encoding), value))
+ strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs]).encode(self.encoding)
if tag in self.elements_no_end_tag:
- self.pieces.append('<%s%s />' % (tag, strattrs))
+ self.pieces.append('<%(tag)s%(strattrs)s />' % locals())
else:
- self.pieces.append('<%s%s>' % (tag, strattrs))
+ self.pieces.append('<%(tag)s%(strattrs)s>' % locals())
def unknown_endtag(self, tag):
# called for each end tag, e.g. for
, tag will be 'pre'
# Reconstruct the original end tag.
if tag not in self.elements_no_end_tag:
- self.pieces.append("%s>" % tag)
+ self.pieces.append("%(tag)s>" % locals())
def handle_charref(self, ref):
# called for each character reference, e.g. for ' ', ref will be '160'
# Reconstruct the original character reference.
- ref = ref.lower()
- if ref.startswith('x'):
- value = int(ref[1:], 16)
- else:
- value = int(ref)
-
- if value in _cp1252:
- self.pieces.append('%s;' % hex(ord(_cp1252[value]))[1:])
- else:
- self.pieces.append('%s;' % ref)
-
+ self.pieces.append('%(ref)s;' % locals())
+
def handle_entityref(self, ref):
# called for each entity reference, e.g. for '©', ref will be 'copy'
# Reconstruct the original entity reference.
- if ref in name2codepoint or ref == 'apos':
- self.pieces.append('&%s;' % ref)
- else:
- self.pieces.append('&%s' % ref)
+ self.pieces.append('&%(ref)s;' % locals())
def handle_data(self, text):
# called for each block of plain text, i.e. outside of any tag and
# not containing any character or entity references
# Store the original text verbatim.
+ if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_text, text=%s\n' % text)
self.pieces.append(text)
-
+
def handle_comment(self, text):
# called for each HTML comment, e.g.
# Reconstruct the original comment.
- self.pieces.append('' % text)
-
+ self.pieces.append('' % locals())
+
def handle_pi(self, text):
# called for each processing instruction, e.g.
# Reconstruct original processing instruction.
- self.pieces.append('%s>' % text)
+ self.pieces.append('%(text)s>' % locals())
def handle_decl(self, text):
# called for the DOCTYPE, if present, e.g.
#
# Reconstruct original DOCTYPE
- self.pieces.append('' % text)
-
+ self.pieces.append('' % locals())
+
_new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
def _scan_name(self, i, declstartpos):
rawdata = self.rawdata
@@ -2031,497 +1521,36 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser):
# self.updatepos(declstartpos, i)
return None, -1
- def convert_charref(self, name):
- return '%s;' % name
-
- def convert_entityref(self, name):
- return '&%s;' % name
-
def output(self):
'''Return processed HTML as a single string'''
return ''.join([str(p) for p in self.pieces])
- def parse_declaration(self, i):
- try:
- return sgmllib.SGMLParser.parse_declaration(self, i)
- except sgmllib.SGMLParseError:
- # escape the doctype declaration and continue parsing
- self.handle_data('<')
- return i+1
-
class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor):
- def __init__(self, baseuri, baselang, encoding, entities):
+ def __init__(self, baseuri, baselang, encoding):
sgmllib.SGMLParser.__init__(self)
_FeedParserMixin.__init__(self, baseuri, baselang, encoding)
- _BaseHTMLProcessor.__init__(self, encoding, 'application/xhtml+xml')
- self.entities=entities
def decodeEntities(self, element, data):
data = data.replace('<', '<')
data = data.replace('<', '<')
- data = data.replace('<', '<')
data = data.replace('>', '>')
data = data.replace('>', '>')
- data = data.replace('>', '>')
data = data.replace('&', '&')
data = data.replace('&', '&')
data = data.replace('"', '"')
data = data.replace('"', '"')
data = data.replace(''', ''')
data = data.replace(''', ''')
- if not self.contentparams.get('type', 'xml').endswith('xml'):
+ if self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
data = data.replace('<', '<')
data = data.replace('>', '>')
data = data.replace('&', '&')
data = data.replace('"', '"')
data = data.replace(''', "'")
return data
-
- def strattrs(self, attrs):
- return ''.join([' %s="%s"' % (n,v.replace('"','"')) for n,v in attrs])
-
-class _MicroformatsParser:
- STRING = 1
- DATE = 2
- URI = 3
- NODE = 4
- EMAIL = 5
-
- known_xfn_relationships = set(['contact', 'acquaintance', 'friend', 'met', 'co-worker', 'coworker', 'colleague', 'co-resident', 'coresident', 'neighbor', 'child', 'parent', 'sibling', 'brother', 'sister', 'spouse', 'wife', 'husband', 'kin', 'relative', 'muse', 'crush', 'date', 'sweetheart', 'me'])
- known_binary_extensions = set(['zip','rar','exe','gz','tar','tgz','tbz2','bz2','z','7z','dmg','img','sit','sitx','hqx','deb','rpm','bz2','jar','rar','iso','bin','msi','mp2','mp3','ogg','ogm','mp4','m4v','m4a','avi','wma','wmv'])
-
- def __init__(self, data, baseuri, encoding):
- self.document = BeautifulSoup.BeautifulSoup(data)
- self.baseuri = baseuri
- self.encoding = encoding
- if isinstance(data, str):
- data = data.encode(encoding)
- self.tags = []
- self.enclosures = []
- self.xfn = []
- self.vcard = None
-
- def vcardEscape(self, s):
- if isinstance(s, str):
- s = s.replace(',', '\\,').replace(';', '\\;').replace('\n', '\\n')
- return s
-
- def vcardFold(self, s):
- s = re.sub(';+$', '', s)
- sFolded = ''
- iMax = 75
- sPrefix = ''
- while len(s) > iMax:
- sFolded += sPrefix + s[:iMax] + '\n'
- s = s[iMax:]
- sPrefix = ' '
- iMax = 74
- sFolded += sPrefix + s
- return sFolded
-
- def normalize(self, s):
- return re.sub(r'\s+', ' ', s).strip()
-
- def unique(self, aList):
- results = []
- for element in aList:
- if element not in results:
- results.append(element)
- return results
-
- def toISO8601(self, dt):
- return time.strftime('%Y-%m-%dT%H:%M:%SZ', dt)
-
- def getPropertyValue(self, elmRoot, sProperty, iPropertyType=4, bAllowMultiple=0, bAutoEscape=0):
- all = lambda x: 1
- sProperty = sProperty.lower()
- bFound = 0
- bNormalize = 1
- propertyMatch = {'class': re.compile(r'\b%s\b' % sProperty)}
- if bAllowMultiple and (iPropertyType != self.NODE):
- snapResults = []
- containers = elmRoot(['ul', 'ol'], propertyMatch)
- for container in containers:
- snapResults.extend(container('li'))
- bFound = (len(snapResults) != 0)
- if not bFound:
- snapResults = elmRoot(all, propertyMatch)
- bFound = (len(snapResults) != 0)
- if (not bFound) and (sProperty == 'value'):
- snapResults = elmRoot('pre')
- bFound = (len(snapResults) != 0)
- bNormalize = not bFound
- if not bFound:
- snapResults = [elmRoot]
- bFound = (len(snapResults) != 0)
- arFilter = []
- if sProperty == 'vcard':
- snapFilter = elmRoot(all, propertyMatch)
- for node in snapFilter:
- if node.findParent(all, propertyMatch):
- arFilter.append(node)
- arResults = []
- for node in snapResults:
- if node not in arFilter:
- arResults.append(node)
- bFound = (len(arResults) != 0)
- if not bFound:
- if bAllowMultiple:
- return []
- elif iPropertyType == self.STRING:
- return ''
- elif iPropertyType == self.DATE:
- return None
- elif iPropertyType == self.URI:
- return ''
- elif iPropertyType == self.NODE:
- return None
- else:
- return None
- arValues = []
- for elmResult in arResults:
- sValue = None
- if iPropertyType == self.NODE:
- if bAllowMultiple:
- arValues.append(elmResult)
- continue
- else:
- return elmResult
- sNodeName = elmResult.name.lower()
- if (iPropertyType == self.EMAIL) and (sNodeName == 'a'):
- sValue = (elmResult.get('href') or '').split('mailto:').pop().split('?')[0]
- if sValue:
- sValue = bNormalize and self.normalize(sValue) or sValue.strip()
- if (not sValue) and (sNodeName == 'abbr'):
- sValue = elmResult.get('title')
- if sValue:
- sValue = bNormalize and self.normalize(sValue) or sValue.strip()
- if (not sValue) and (iPropertyType == self.URI):
- if sNodeName == 'a':
- sValue = elmResult.get('href')
- elif sNodeName == 'img':
- sValue = elmResult.get('src')
- elif sNodeName == 'object':
- sValue = elmResult.get('data')
- if sValue:
- sValue = bNormalize and self.normalize(sValue) or sValue.strip()
- if (not sValue) and (sNodeName == 'img'):
- sValue = elmResult.get('alt')
- if sValue:
- sValue = bNormalize and self.normalize(sValue) or sValue.strip()
- if not sValue:
- sValue = elmResult.renderContents()
- sValue = re.sub(r'<\S[^>]*>', '', sValue)
- sValue = sValue.replace('\r\n', '\n')
- sValue = sValue.replace('\r', '\n')
- if sValue:
- sValue = bNormalize and self.normalize(sValue) or sValue.strip()
- if not sValue:
- continue
- if iPropertyType == self.DATE:
- sValue = _parse_date_iso8601(sValue)
- if bAllowMultiple:
- arValues.append(bAutoEscape and self.vcardEscape(sValue) or sValue)
- else:
- return bAutoEscape and self.vcardEscape(sValue) or sValue
- return arValues
-
- def findVCards(self, elmRoot, bAgentParsing=0):
- sVCards = ''
-
- if not bAgentParsing:
- arCards = self.getPropertyValue(elmRoot, 'vcard', bAllowMultiple=1)
- else:
- arCards = [elmRoot]
-
- for elmCard in arCards:
- arLines = []
-
- def processSingleString(sProperty):
- sValue = self.getPropertyValue(elmCard, sProperty, self.STRING, bAutoEscape=1).decode(self.encoding)
- if sValue:
- arLines.append(self.vcardFold(sProperty.upper() + ':' + sValue))
- return sValue or ''
-
- def processSingleURI(sProperty):
- sValue = self.getPropertyValue(elmCard, sProperty, self.URI)
- if sValue:
- sContentType = ''
- sEncoding = ''
- sValueKey = ''
- if sValue.startswith('data:'):
- sEncoding = ';ENCODING=b'
- sContentType = sValue.split(';')[0].split('/').pop()
- sValue = sValue.split(',', 1).pop()
- else:
- elmValue = self.getPropertyValue(elmCard, sProperty)
- if elmValue:
- if sProperty != 'url':
- sValueKey = ';VALUE=uri'
- sContentType = elmValue.get('type', '').strip().split('/').pop().strip()
- sContentType = sContentType.upper()
- if sContentType == 'OCTET-STREAM':
- sContentType = ''
- if sContentType:
- sContentType = ';TYPE=' + sContentType.upper()
- arLines.append(self.vcardFold(sProperty.upper() + sEncoding + sContentType + sValueKey + ':' + sValue))
-
- def processTypeValue(sProperty, arDefaultType, arForceType=None):
- arResults = self.getPropertyValue(elmCard, sProperty, bAllowMultiple=1)
- for elmResult in arResults:
- arType = self.getPropertyValue(elmResult, 'type', self.STRING, 1, 1)
- if arForceType:
- arType = self.unique(arForceType + arType)
- if not arType:
- arType = arDefaultType
- sValue = self.getPropertyValue(elmResult, 'value', self.EMAIL, 0)
- if sValue:
- arLines.append(self.vcardFold(sProperty.upper() + ';TYPE=' + ','.join(arType) + ':' + sValue))
-
- # AGENT
- # must do this before all other properties because it is destructive
- # (removes nested class="vcard" nodes so they don't interfere with
- # this vcard's other properties)
- arAgent = self.getPropertyValue(elmCard, 'agent', bAllowMultiple=1)
- for elmAgent in arAgent:
- if re.compile(r'\bvcard\b').search(elmAgent.get('class')):
- sAgentValue = self.findVCards(elmAgent, 1) + '\n'
- sAgentValue = sAgentValue.replace('\n', '\\n')
- sAgentValue = sAgentValue.replace(';', '\\;')
- if sAgentValue:
- arLines.append(self.vcardFold('AGENT:' + sAgentValue))
- # Completely remove the agent element from the parse tree
- elmAgent.extract()
- else:
- sAgentValue = self.getPropertyValue(elmAgent, 'value', self.URI, bAutoEscape=1);
- if sAgentValue:
- arLines.append(self.vcardFold('AGENT;VALUE=uri:' + sAgentValue))
-
- # FN (full name)
- sFN = processSingleString('fn')
-
- # N (name)
- elmName = self.getPropertyValue(elmCard, 'n')
- if elmName:
- sFamilyName = self.getPropertyValue(elmName, 'family-name', self.STRING, bAutoEscape=1)
- sGivenName = self.getPropertyValue(elmName, 'given-name', self.STRING, bAutoEscape=1)
- arAdditionalNames = self.getPropertyValue(elmName, 'additional-name', self.STRING, 1, 1) + self.getPropertyValue(elmName, 'additional-names', self.STRING, 1, 1)
- arHonorificPrefixes = self.getPropertyValue(elmName, 'honorific-prefix', self.STRING, 1, 1) + self.getPropertyValue(elmName, 'honorific-prefixes', self.STRING, 1, 1)
- arHonorificSuffixes = self.getPropertyValue(elmName, 'honorific-suffix', self.STRING, 1, 1) + self.getPropertyValue(elmName, 'honorific-suffixes', self.STRING, 1, 1)
- arLines.append(self.vcardFold('N:' + sFamilyName + ';' +
- sGivenName + ';' +
- ','.join(arAdditionalNames) + ';' +
- ','.join(arHonorificPrefixes) + ';' +
- ','.join(arHonorificSuffixes)))
- elif sFN:
- # implied "N" optimization
- # http://microformats.org/wiki/hcard#Implied_.22N.22_Optimization
- arNames = self.normalize(sFN).split()
- if len(arNames) == 2:
- bFamilyNameFirst = (arNames[0].endswith(',') or
- len(arNames[1]) == 1 or
- ((len(arNames[1]) == 2) and (arNames[1].endswith('.'))))
- if bFamilyNameFirst:
- arLines.append(self.vcardFold('N:' + arNames[0] + ';' + arNames[1]))
- else:
- arLines.append(self.vcardFold('N:' + arNames[1] + ';' + arNames[0]))
-
- # SORT-STRING
- sSortString = self.getPropertyValue(elmCard, 'sort-string', self.STRING, bAutoEscape=1)
- if sSortString:
- arLines.append(self.vcardFold('SORT-STRING:' + sSortString))
-
- # NICKNAME
- arNickname = self.getPropertyValue(elmCard, 'nickname', self.STRING, 1, 1)
- if arNickname:
- arLines.append(self.vcardFold('NICKNAME:' + ','.join(arNickname)))
-
- # PHOTO
- processSingleURI('photo')
-
- # BDAY
- dtBday = self.getPropertyValue(elmCard, 'bday', self.DATE)
- if dtBday:
- arLines.append(self.vcardFold('BDAY:' + self.toISO8601(dtBday)))
-
- # ADR (address)
- arAdr = self.getPropertyValue(elmCard, 'adr', bAllowMultiple=1)
- for elmAdr in arAdr:
- arType = self.getPropertyValue(elmAdr, 'type', self.STRING, 1, 1)
- if not arType:
- arType = ['intl','postal','parcel','work'] # default adr types, see RFC 2426 section 3.2.1
- sPostOfficeBox = self.getPropertyValue(elmAdr, 'post-office-box', self.STRING, 0, 1)
- sExtendedAddress = self.getPropertyValue(elmAdr, 'extended-address', self.STRING, 0, 1)
- sStreetAddress = self.getPropertyValue(elmAdr, 'street-address', self.STRING, 0, 1)
- sLocality = self.getPropertyValue(elmAdr, 'locality', self.STRING, 0, 1)
- sRegion = self.getPropertyValue(elmAdr, 'region', self.STRING, 0, 1)
- sPostalCode = self.getPropertyValue(elmAdr, 'postal-code', self.STRING, 0, 1)
- sCountryName = self.getPropertyValue(elmAdr, 'country-name', self.STRING, 0, 1)
- arLines.append(self.vcardFold('ADR;TYPE=' + ','.join(arType) + ':' +
- sPostOfficeBox + ';' +
- sExtendedAddress + ';' +
- sStreetAddress + ';' +
- sLocality + ';' +
- sRegion + ';' +
- sPostalCode + ';' +
- sCountryName))
-
- # LABEL
- processTypeValue('label', ['intl','postal','parcel','work'])
-
- # TEL (phone number)
- processTypeValue('tel', ['voice'])
-
- # EMAIL
- processTypeValue('email', ['internet'], ['internet'])
-
- # MAILER
- processSingleString('mailer')
-
- # TZ (timezone)
- processSingleString('tz')
-
- # GEO (geographical information)
- elmGeo = self.getPropertyValue(elmCard, 'geo')
- if elmGeo:
- sLatitude = self.getPropertyValue(elmGeo, 'latitude', self.STRING, 0, 1)
- sLongitude = self.getPropertyValue(elmGeo, 'longitude', self.STRING, 0, 1)
- arLines.append(self.vcardFold('GEO:' + sLatitude + ';' + sLongitude))
-
- # TITLE
- processSingleString('title')
-
- # ROLE
- processSingleString('role')
-
- # LOGO
- processSingleURI('logo')
-
- # ORG (organization)
- elmOrg = self.getPropertyValue(elmCard, 'org')
- if elmOrg:
- sOrganizationName = self.getPropertyValue(elmOrg, 'organization-name', self.STRING, 0, 1)
- if not sOrganizationName:
- # implied "organization-name" optimization
- # http://microformats.org/wiki/hcard#Implied_.22organization-name.22_Optimization
- sOrganizationName = self.getPropertyValue(elmCard, 'org', self.STRING, 0, 1)
- if sOrganizationName:
- arLines.append(self.vcardFold('ORG:' + sOrganizationName))
- else:
- arOrganizationUnit = self.getPropertyValue(elmOrg, 'organization-unit', self.STRING, 1, 1)
- arLines.append(self.vcardFold('ORG:' + sOrganizationName + ';' + ';'.join(arOrganizationUnit)))
-
- # CATEGORY
- arCategory = self.getPropertyValue(elmCard, 'category', self.STRING, 1, 1) + self.getPropertyValue(elmCard, 'categories', self.STRING, 1, 1)
- if arCategory:
- arLines.append(self.vcardFold('CATEGORIES:' + ','.join(arCategory)))
-
- # NOTE
- processSingleString('note')
-
- # REV
- processSingleString('rev')
-
- # SOUND
- processSingleURI('sound')
-
- # UID
- processSingleString('uid')
-
- # URL
- processSingleURI('url')
-
- # CLASS
- processSingleString('class')
-
- # KEY
- processSingleURI('key')
-
- if arLines:
- arLines = ['BEGIN:vCard','VERSION:3.0'] + arLines + ['END:vCard']
- # XXX - this is super ugly; properly fix this with issue 148
- for i, s in enumerate(arLines):
- if not isinstance(s, str):
- arLines[i] = s.decode('utf-8', 'ignore')
- sVCards += '\n'.join(arLines) + '\n'
-
- return sVCards.strip()
-
- def isProbablyDownloadable(self, elm):
- attrsD = elm.attrMap
- if 'href' not in attrsD:
- return 0
- linktype = attrsD.get('type', '').strip()
- if linktype.startswith('audio/') or \
- linktype.startswith('video/') or \
- (linktype.startswith('application/') and not linktype.endswith('xml')):
- return 1
- try:
- path = urllib.parse.urlparse(attrsD['href'])[2]
- except ValueError:
- return 0
- if path.find('.') == -1:
- return 0
- fileext = path.split('.').pop().lower()
- return fileext in self.known_binary_extensions
-
- def findTags(self):
- all = lambda x: 1
- for elm in self.document(all, {'rel': re.compile(r'\btag\b')}):
- href = elm.get('href')
- if not href:
- continue
- urlscheme, domain, path, params, query, fragment = \
- urllib.parse.urlparse(_urljoin(self.baseuri, href))
- segments = path.split('/')
- tag = segments.pop()
- if not tag:
- if segments:
- tag = segments.pop()
- else:
- # there are no tags
- continue
- tagscheme = urllib.parse.urlunparse((urlscheme, domain, '/'.join(segments), '', '', ''))
- if not tagscheme.endswith('/'):
- tagscheme += '/'
- self.tags.append(FeedParserDict({"term": tag, "scheme": tagscheme, "label": elm.string or ''}))
-
- def findEnclosures(self):
- all = lambda x: 1
- enclosure_match = re.compile(r'\benclosure\b')
- for elm in self.document(all, {'href': re.compile(r'.+')}):
- if not enclosure_match.search(elm.get('rel', '')) and not self.isProbablyDownloadable(elm):
- continue
- if elm.attrMap not in self.enclosures:
- self.enclosures.append(elm.attrMap)
- if elm.string and not elm.get('title'):
- self.enclosures[-1]['title'] = elm.string
-
- def findXFN(self):
- all = lambda x: 1
- for elm in self.document(all, {'rel': re.compile('.+'), 'href': re.compile('.+')}):
- rels = elm.get('rel', '').split()
- xfn_rels = [r for r in rels if r in self.known_xfn_relationships]
- if xfn_rels:
- self.xfn.append({"relationships": xfn_rels, "href": elm.get('href', ''), "name": elm.string})
-
-def _parseMicroformats(htmlSource, baseURI, encoding):
- if not BeautifulSoup:
- return
- try:
- p = _MicroformatsParser(htmlSource, baseURI, encoding)
- except UnicodeEncodeError:
- # sgmllib throws this exception when performing lookups of tags
- # with non-ASCII characters in them.
- return
- p.vcard = p.findVCards(p.document)
- p.findTags()
- p.findEnclosures()
- p.findXFN()
- return {"tags": p.tags, "enclosures": p.enclosures, "xfn": p.xfn, "vcard": p.vcard}
-
+
class _RelativeURIResolver(_BaseHTMLProcessor):
- relative_uris = set([('a', 'href'),
+ relative_uris = [('a', 'href'),
('applet', 'codebase'),
('area', 'href'),
('blockquote', 'cite'),
@@ -2545,259 +1574,67 @@ class _RelativeURIResolver(_BaseHTMLProcessor):
('object', 'data'),
('object', 'usemap'),
('q', 'cite'),
- ('script', 'src'),
- ('video', 'poster')])
+ ('script', 'src')]
- def __init__(self, baseuri, encoding, _type):
- _BaseHTMLProcessor.__init__(self, encoding, _type)
+ def __init__(self, baseuri, encoding):
+ _BaseHTMLProcessor.__init__(self, encoding)
self.baseuri = baseuri
def resolveURI(self, uri):
- return _makeSafeAbsoluteURI(self.baseuri, uri.strip())
-
+ return _urljoin(self.baseuri, uri)
+
def unknown_starttag(self, tag, attrs):
attrs = self.normalize_attrs(attrs)
attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs]
_BaseHTMLProcessor.unknown_starttag(self, tag, attrs)
-
-def _resolveRelativeURIs(htmlSource, baseURI, encoding, _type):
- if not _SGML_AVAILABLE:
- return htmlSource
-
- p = _RelativeURIResolver(baseURI, encoding, _type)
+
+def _resolveRelativeURIs(htmlSource, baseURI, encoding):
+ if _debug: sys.stderr.write('entering _resolveRelativeURIs\n')
+ p = _RelativeURIResolver(baseURI, encoding)
p.feed(htmlSource)
return p.output()
-def _makeSafeAbsoluteURI(base, rel=None):
- # bail if ACCEPTABLE_URI_SCHEMES is empty
- if not ACCEPTABLE_URI_SCHEMES:
- try:
- return _urljoin(base, rel or '')
- except ValueError:
- return ''
- if not base:
- return rel or ''
- if not rel:
- try:
- scheme = urllib.parse.urlparse(base)[0]
- except ValueError:
- return ''
- if not scheme or scheme in ACCEPTABLE_URI_SCHEMES:
- return base
- return ''
- try:
- uri = _urljoin(base, rel)
- except ValueError:
- return ''
- if uri.strip().split(':', 1)[0] not in ACCEPTABLE_URI_SCHEMES:
- return ''
- return uri
-
class _HTMLSanitizer(_BaseHTMLProcessor):
- acceptable_elements = set(['a', 'abbr', 'acronym', 'address', 'area',
- 'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button',
- 'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup',
- 'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn',
- 'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset',
- 'figcaption', 'figure', 'footer', 'font', 'form', 'header', 'h1',
- 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins',
- 'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter',
- 'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option',
- 'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',
- 'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong',
- 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot',
- 'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video', 'noscript'])
+ acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', 'big',
+ 'blockquote', 'br', 'button', 'caption', 'center', 'cite', 'code', 'col',
+ 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'fieldset',
+ 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input',
+ 'ins', 'kbd', 'label', 'legend', 'li', 'map', 'menu', 'ol', 'optgroup',
+ 'option', 'p', 'pre', 'q', 's', 'samp', 'select', 'small', 'span', 'strike',
+ 'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'tfoot', 'th',
+ 'thead', 'tr', 'tt', 'u', 'ul', 'var']
- acceptable_attributes = set(['abbr', 'accept', 'accept-charset', 'accesskey',
- 'action', 'align', 'alt', 'autocomplete', 'autofocus', 'axis',
- 'background', 'balance', 'bgcolor', 'bgproperties', 'border',
- 'bordercolor', 'bordercolordark', 'bordercolorlight', 'bottompadding',
- 'cellpadding', 'cellspacing', 'ch', 'challenge', 'char', 'charoff',
- 'choff', 'charset', 'checked', 'cite', 'class', 'clear', 'color', 'cols',
- 'colspan', 'compact', 'contenteditable', 'controls', 'coords', 'data',
- 'datafld', 'datapagesize', 'datasrc', 'datetime', 'default', 'delay',
- 'dir', 'disabled', 'draggable', 'dynsrc', 'enctype', 'end', 'face', 'for',
- 'form', 'frame', 'galleryimg', 'gutter', 'headers', 'height', 'hidefocus',
- 'hidden', 'high', 'href', 'hreflang', 'hspace', 'icon', 'id', 'inputmode',
- 'ismap', 'keytype', 'label', 'leftspacing', 'lang', 'list', 'longdesc',
- 'loop', 'loopcount', 'loopend', 'loopstart', 'low', 'lowsrc', 'max',
- 'maxlength', 'media', 'method', 'min', 'multiple', 'name', 'nohref',
- 'noshade', 'nowrap', 'open', 'optimum', 'pattern', 'ping', 'point-size',
- 'poster', 'pqg', 'preload', 'prompt', 'radiogroup', 'readonly', 'rel',
- 'repeat-max', 'repeat-min', 'replace', 'required', 'rev', 'rightspacing',
- 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size', 'span',
- 'src', 'start', 'step', 'summary', 'suppress', 'tabindex', 'target',
- 'template', 'title', 'toppadding', 'type', 'unselectable', 'usemap',
- 'urn', 'valign', 'value', 'variable', 'volume', 'vspace', 'vrml',
- 'width', 'wrap', 'xml:lang'])
+ acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
+ 'action', 'align', 'alt', 'axis', 'border', 'cellpadding', 'cellspacing',
+ 'char', 'charoff', 'charset', 'checked', 'cite', 'class', 'clear', 'cols',
+ 'colspan', 'color', 'compact', 'coords', 'datetime', 'dir', 'disabled',
+ 'enctype', 'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace',
+ 'id', 'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method',
+ 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly',
+ 'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
+ 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title', 'type',
+ 'usemap', 'valign', 'value', 'vspace', 'width']
- unacceptable_elements_with_end_tag = set(['script', 'applet', 'style'])
-
- acceptable_css_properties = set(['azimuth', 'background-color',
- 'border-bottom-color', 'border-collapse', 'border-color',
- 'border-left-color', 'border-right-color', 'border-top-color', 'clear',
- 'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
- 'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
- 'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
- 'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
- 'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
- 'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
- 'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
- 'white-space', 'width'])
-
- # survey of common keywords found in feeds
- acceptable_css_keywords = set(['auto', 'aqua', 'black', 'block', 'blue',
- 'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
- 'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
- 'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
- 'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
- 'transparent', 'underline', 'white', 'yellow'])
-
- valid_css_values = re.compile('^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|' +
- '\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$')
-
- mathml_elements = set(['annotation', 'annotation-xml', 'maction', 'math',
- 'merror', 'mfenced', 'mfrac', 'mi', 'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded',
- 'mphantom', 'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle',
- 'msub', 'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
- 'munderover', 'none', 'semantics'])
-
- mathml_attributes = set(['actiontype', 'align', 'columnalign', 'columnalign',
- 'columnalign', 'close', 'columnlines', 'columnspacing', 'columnspan', 'depth',
- 'display', 'displaystyle', 'encoding', 'equalcolumns', 'equalrows',
- 'fence', 'fontstyle', 'fontweight', 'frame', 'height', 'linethickness',
- 'lspace', 'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant',
- 'maxsize', 'minsize', 'open', 'other', 'rowalign', 'rowalign', 'rowalign',
- 'rowlines', 'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
- 'separator', 'separators', 'stretchy', 'width', 'width', 'xlink:href',
- 'xlink:show', 'xlink:type', 'xmlns', 'xmlns:xlink'])
-
- # svgtiny - foreignObject + linearGradient + radialGradient + stop
- svg_elements = set(['a', 'animate', 'animateColor', 'animateMotion',
- 'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'foreignObject',
- 'font-face', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern',
- 'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph', 'mpath',
- 'path', 'polygon', 'polyline', 'radialGradient', 'rect', 'set', 'stop',
- 'svg', 'switch', 'text', 'title', 'tspan', 'use'])
-
- # svgtiny + class + opacity + offset + xmlns + xmlns:xlink
- svg_attributes = set(['accent-height', 'accumulate', 'additive', 'alphabetic',
- 'arabic-form', 'ascent', 'attributeName', 'attributeType',
- 'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
- 'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx',
- 'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-opacity',
- 'fill-rule', 'font-family', 'font-size', 'font-stretch', 'font-style',
- 'font-variant', 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2',
- 'glyph-name', 'gradientUnits', 'hanging', 'height', 'horiz-adv-x',
- 'horiz-origin-x', 'id', 'ideographic', 'k', 'keyPoints', 'keySplines',
- 'keyTimes', 'lang', 'mathematical', 'marker-end', 'marker-mid',
- 'marker-start', 'markerHeight', 'markerUnits', 'markerWidth', 'max',
- 'min', 'name', 'offset', 'opacity', 'orient', 'origin',
- 'overline-position', 'overline-thickness', 'panose-1', 'path',
- 'pathLength', 'points', 'preserveAspectRatio', 'r', 'refX', 'refY',
- 'repeatCount', 'repeatDur', 'requiredExtensions', 'requiredFeatures',
- 'restart', 'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv',
- 'stop-color', 'stop-opacity', 'strikethrough-position',
- 'strikethrough-thickness', 'stroke', 'stroke-dasharray',
- 'stroke-dashoffset', 'stroke-linecap', 'stroke-linejoin',
- 'stroke-miterlimit', 'stroke-opacity', 'stroke-width', 'systemLanguage',
- 'target', 'text-anchor', 'to', 'transform', 'type', 'u1', 'u2',
- 'underline-position', 'underline-thickness', 'unicode', 'unicode-range',
- 'units-per-em', 'values', 'version', 'viewBox', 'visibility', 'width',
- 'widths', 'x', 'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole',
- 'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title', 'xlink:type',
- 'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1',
- 'y2', 'zoomAndPan'])
-
- svg_attr_map = None
- svg_elem_map = None
-
- acceptable_svg_properties = set([ 'fill', 'fill-opacity', 'fill-rule',
- 'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
- 'stroke-opacity'])
+ unacceptable_elements_with_end_tag = ['script', 'applet']
def reset(self):
_BaseHTMLProcessor.reset(self)
self.unacceptablestack = 0
- self.mathmlOK = 0
- self.svgOK = 0
-
+
def unknown_starttag(self, tag, attrs):
- acceptable_attributes = self.acceptable_attributes
- keymap = {}
- if not tag in self.acceptable_elements or self.svgOK:
+ if not tag in self.acceptable_elements:
if tag in self.unacceptable_elements_with_end_tag:
self.unacceptablestack += 1
-
- # add implicit namespaces to html5 inline svg/mathml
- if self._type.endswith('html'):
- if not dict(attrs).get('xmlns'):
- if tag=='svg':
- attrs.append( ('xmlns','http://www.w3.org/2000/svg') )
- if tag=='math':
- attrs.append( ('xmlns','http://www.w3.org/1998/Math/MathML') )
-
- # not otherwise acceptable, perhaps it is MathML or SVG?
- if tag=='math' and ('xmlns','http://www.w3.org/1998/Math/MathML') in attrs:
- self.mathmlOK += 1
- if tag=='svg' and ('xmlns','http://www.w3.org/2000/svg') in attrs:
- self.svgOK += 1
-
- # chose acceptable attributes based on tag class, else bail
- if self.mathmlOK and tag in self.mathml_elements:
- acceptable_attributes = self.mathml_attributes
- elif self.svgOK and tag in self.svg_elements:
- # for most vocabularies, lowercasing is a good idea. Many
- # svg elements, however, are camel case
- if not self.svg_attr_map:
- lower=[attr.lower() for attr in self.svg_attributes]
- mix=[a for a in self.svg_attributes if a not in lower]
- self.svg_attributes = lower
- self.svg_attr_map = dict([(a.lower(),a) for a in mix])
-
- lower=[attr.lower() for attr in self.svg_elements]
- mix=[a for a in self.svg_elements if a not in lower]
- self.svg_elements = lower
- self.svg_elem_map = dict([(a.lower(),a) for a in mix])
- acceptable_attributes = self.svg_attributes
- tag = self.svg_elem_map.get(tag,tag)
- keymap = self.svg_attr_map
- elif not tag in self.acceptable_elements:
- return
-
- # declare xlink namespace, if needed
- if self.mathmlOK or self.svgOK:
- if [n_v for n_v in attrs if n_v[0].startswith('xlink:')]:
- if not ('xmlns:xlink','http://www.w3.org/1999/xlink') in attrs:
- attrs.append(('xmlns:xlink','http://www.w3.org/1999/xlink'))
-
- clean_attrs = []
- for key, value in self.normalize_attrs(attrs):
- if key in acceptable_attributes:
- key=keymap.get(key,key)
- # make sure the uri uses an acceptable uri scheme
- if key == 'href':
- value = _makeSafeAbsoluteURI(value)
- clean_attrs.append((key,value))
- elif key=='style':
- clean_value = self.sanitize_style(value)
- if clean_value:
- clean_attrs.append((key,clean_value))
- _BaseHTMLProcessor.unknown_starttag(self, tag, clean_attrs)
-
+ return
+ attrs = self.normalize_attrs(attrs)
+ attrs = [(key, value) for key, value in attrs if key in self.acceptable_attributes]
+ _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)
+
def unknown_endtag(self, tag):
if not tag in self.acceptable_elements:
if tag in self.unacceptable_elements_with_end_tag:
self.unacceptablestack -= 1
- if self.mathmlOK and tag in self.mathml_elements:
- if tag == 'math' and self.mathmlOK:
- self.mathmlOK -= 1
- elif self.svgOK and tag in self.svg_elements:
- tag = self.svg_elem_map.get(tag,tag)
- if tag == 'svg' and self.svgOK:
- self.svgOK -= 1
- else:
- return
+ return
_BaseHTMLProcessor.unknown_endtag(self, tag)
def handle_pi(self, text):
@@ -2810,53 +1647,8 @@ class _HTMLSanitizer(_BaseHTMLProcessor):
if not self.unacceptablestack:
_BaseHTMLProcessor.handle_data(self, text)
- def sanitize_style(self, style):
- # disallow urls
- style=re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ',style)
-
- # gauntlet
- if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
- return ''
- # This replaced a regexp that used re.match and was prone to pathological back-tracking.
- if re.sub("\s*[-\w]+\s*:\s*[^:;]*;?", '', style).strip():
- return ''
-
- clean = []
- for prop,value in re.findall("([-\w]+)\s*:\s*([^:;]*)",style):
- if not value:
- continue
- if prop.lower() in self.acceptable_css_properties:
- clean.append(prop + ': ' + value + ';')
- elif prop.split('-')[0].lower() in ['background','border','margin','padding']:
- for keyword in value.split():
- if not keyword in self.acceptable_css_keywords and \
- not self.valid_css_values.match(keyword):
- break
- else:
- clean.append(prop + ': ' + value + ';')
- elif self.svgOK and prop.lower() in self.acceptable_svg_properties:
- clean.append(prop + ': ' + value + ';')
-
- return ' '.join(clean)
-
- def parse_comment(self, i, report=1):
- ret = _BaseHTMLProcessor.parse_comment(self, i, report)
- if ret >= 0:
- return ret
- # if ret == -1, this may be a malicious attempt to circumvent
- # sanitization, or a page-destroying unclosed comment
- match = re.compile(r'--[^>]*>').search(self.rawdata, i+4)
- if match:
- return match.end()
- # unclosed comment; deliberately fail to handle_data()
- return len(self.rawdata)
-
-
-def _sanitizeHTML(htmlSource, encoding, _type):
- if not _SGML_AVAILABLE:
- return htmlSource
- p = _HTMLSanitizer(encoding, _type)
- htmlSource = htmlSource.replace(''):
@@ -2894,50 +1686,61 @@ def _sanitizeHTML(htmlSource, encoding, _type):
data = data.strip().replace('\r\n', '\n')
return data
-class _FeedURLHandler(urllib.request.HTTPDigestAuthHandler, urllib.request.HTTPRedirectHandler, urllib.request.HTTPDefaultErrorHandler):
+class _FeedURLHandler(urllib2.HTTPDigestAuthHandler, urllib2.HTTPRedirectHandler, urllib2.HTTPDefaultErrorHandler):
def http_error_default(self, req, fp, code, msg, headers):
- # The default implementation just raises HTTPError.
- # Forget that.
- fp.status = code
- return fp
+ if ((code / 100) == 3) and (code != 304):
+ return self.http_error_302(req, fp, code, msg, headers)
+ infourl = urllib.addinfourl(fp, headers, req.get_full_url())
+ infourl.status = code
+ return infourl
- def http_error_301(self, req, fp, code, msg, hdrs):
- result = urllib.request.HTTPRedirectHandler.http_error_301(self, req, fp,
- code, msg, hdrs)
- result.status = code
- result.newurl = result.geturl()
- return result
- # The default implementations in urllib2.HTTPRedirectHandler
- # are identical, so hardcoding a http_error_301 call above
- # won't affect anything
- http_error_300 = http_error_301
- http_error_302 = http_error_301
- http_error_303 = http_error_301
- http_error_307 = http_error_301
+ def http_error_302(self, req, fp, code, msg, headers):
+ if headers.dict.has_key('location'):
+ infourl = urllib2.HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers)
+ else:
+ infourl = urllib.addinfourl(fp, headers, req.get_full_url())
+ if not hasattr(infourl, 'status'):
+ infourl.status = code
+ return infourl
+ def http_error_301(self, req, fp, code, msg, headers):
+ if headers.dict.has_key('location'):
+ infourl = urllib2.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, headers)
+ else:
+ infourl = urllib.addinfourl(fp, headers, req.get_full_url())
+ if not hasattr(infourl, 'status'):
+ infourl.status = code
+ return infourl
+
+ http_error_300 = http_error_302
+ http_error_303 = http_error_302
+ http_error_307 = http_error_302
+
def http_error_401(self, req, fp, code, msg, headers):
# Check if
# - server requires digest auth, AND
# - we tried (unsuccessfully) with basic auth, AND
+ # - we're using Python 2.3.3 or later (digest auth is irreparably broken in earlier versions)
# If all conditions hold, parse authentication information
# out of the Authorization header we sent the first time
# (for the username and password) and the WWW-Authenticate
# header the server sent back (for the realm) and retry
# the request with the appropriate digest auth headers instead.
# This evil genius hack has been brought to you by Aaron Swartz.
- host = urllib.parse.urlparse(req.get_full_url())[1]
- if base64 is None or 'Authorization' not in req.headers \
- or 'WWW-Authenticate' not in headers:
+ host = urlparse.urlparse(req.get_full_url())[1]
+ try:
+ assert sys.version.split()[0] >= '2.3.3'
+ assert base64 != None
+ user, passw = base64.decodestring(req.headers['Authorization'].split(' ')[1]).split(':')
+ realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0]
+ self.add_password(realm, host, user, passw)
+ retry = self.http_error_auth_reqed('www-authenticate', host, req, headers)
+ self.reset_retry_count()
+ return retry
+ except:
return self.http_error_default(req, fp, code, msg, headers)
- auth = _base64decode(req.headers['Authorization'].split(' ')[1])
- user, passw = auth.split(':')
- realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0]
- self.add_password(realm, host, user, passw)
- retry = self.http_error_auth_reqed('www-authenticate', host, req, headers)
- self.reset_retry_count()
- return retry
-def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers):
+def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers):
"""URL, filename, or string --> stream
This function lets you define parsers that take any input source
@@ -2949,12 +1752,10 @@ def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, h
If the etag argument is supplied, it will be used as the value of an
If-None-Match request header.
- If the modified argument is supplied, it can be a tuple of 9 integers
- (as returned by gmtime() in the standard Python time module) or a date
- string in any format supported by feedparser. Regardless, it MUST
- be in GMT (Greenwich Mean Time). It will be reformatted into an
- RFC 1123-compliant date and used as the value of an If-Modified-Since
- request header.
+ If the modified argument is supplied, it must be a tuple of 9 integers
+ as returned by gmtime() in the standard Python time module. This MUST
+ be in GMT (Greenwich Mean Time). The formatted date/time will be used
+ as the value of an If-Modified-Since request header.
If the agent argument is supplied, it will be used as the value of a
User-Agent request header.
@@ -2964,132 +1765,76 @@ def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, h
If handlers is supplied, it is a list of handlers used to build a
urllib2 opener.
-
- if request_headers is supplied it is a dictionary of HTTP request headers
- that will override the values generated by FeedParser.
"""
if hasattr(url_file_stream_or_string, 'read'):
return url_file_stream_or_string
- if isinstance(url_file_stream_or_string, str) \
- and urllib.parse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp', 'file', 'feed'):
- # Deal with the feed URI scheme
- if url_file_stream_or_string.startswith('feed:http'):
- url_file_stream_or_string = url_file_stream_or_string[5:]
- elif url_file_stream_or_string.startswith('feed:'):
- url_file_stream_or_string = 'http:' + url_file_stream_or_string[5:]
+ if url_file_stream_or_string == '-':
+ return sys.stdin
+
+ if urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp'):
if not agent:
agent = USER_AGENT
- # Test for inline user:password credentials for HTTP basic auth
+ # test for inline user:password for basic auth
auth = None
- if base64 and not url_file_stream_or_string.startswith('ftp:'):
- urltype, rest = urllib.parse.splittype(url_file_stream_or_string)
- realhost, rest = urllib.parse.splithost(rest)
+ if base64:
+ urltype, rest = urllib.splittype(url_file_stream_or_string)
+ realhost, rest = urllib.splithost(rest)
if realhost:
- user_passwd, realhost = urllib.parse.splituser(realhost)
+ user_passwd, realhost = urllib.splituser(realhost)
if user_passwd:
url_file_stream_or_string = '%s://%s%s' % (urltype, realhost, rest)
- auth = base64.standard_b64encode(user_passwd).strip()
-
- # iri support
- if isinstance(url_file_stream_or_string, str):
- url_file_stream_or_string = _convert_to_idn(url_file_stream_or_string)
-
+ auth = base64.encodestring(user_passwd).strip()
# try to open with urllib2 (to use optional headers)
- request = _build_urllib2_request(url_file_stream_or_string, agent, etag, modified, referrer, auth, request_headers)
- opener = urllib.request.build_opener(*tuple(handlers + [_FeedURLHandler()]))
+ request = urllib2.Request(url_file_stream_or_string)
+ request.add_header('User-Agent', agent)
+ if etag:
+ request.add_header('If-None-Match', etag)
+ if modified:
+ # format into an RFC 1123-compliant timestamp. We can't use
+ # time.strftime() since the %a and %b directives can be affected
+ # by the current locale, but RFC 2616 states that dates must be
+ # in English.
+ short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
+ months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
+ request.add_header('If-Modified-Since', '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5]))
+ if referrer:
+ request.add_header('Referer', referrer)
+ if gzip and zlib:
+ request.add_header('Accept-encoding', 'gzip, deflate')
+ elif gzip:
+ request.add_header('Accept-encoding', 'gzip')
+ elif zlib:
+ request.add_header('Accept-encoding', 'deflate')
+ else:
+ request.add_header('Accept-encoding', '')
+ if auth:
+ request.add_header('Authorization', 'Basic %s' % auth)
+ if ACCEPT_HEADER:
+ request.add_header('Accept', ACCEPT_HEADER)
+ request.add_header('A-IM', 'feed') # RFC 3229 support
+ opener = apply(urllib2.build_opener, tuple([_FeedURLHandler()] + handlers))
opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
try:
return opener.open(request)
finally:
opener.close() # JohnD
-
+
# try to open with native open function (if url_file_stream_or_string is a filename)
try:
- return open(url_file_stream_or_string, 'rb')
- except (IOError, UnicodeEncodeError, TypeError):
- # if url_file_stream_or_string is a unicode object that
- # cannot be converted to the encoding returned by
- # sys.getfilesystemencoding(), a UnicodeEncodeError
- # will be thrown
- # If url_file_stream_or_string is a string that contains NULL
- # (such as an XML document encoded in UTF-32), TypeError will
- # be thrown.
+ return open(url_file_stream_or_string)
+ except:
pass
# treat url_file_stream_or_string as string
- if isinstance(url_file_stream_or_string, str):
- return _StringIO(url_file_stream_or_string.encode('utf-8'))
- return _StringIO(url_file_stream_or_string)
-
-def _convert_to_idn(url):
- """Convert a URL to IDN notation"""
- # this function should only be called with a unicode string
- # strategy: if the host cannot be encoded in ascii, then
- # it'll be necessary to encode it in idn form
- parts = list(urllib.parse.urlsplit(url))
- try:
- parts[1].encode('ascii')
- except UnicodeEncodeError:
- # the url needs to be converted to idn notation
- host = parts[1].rsplit(':', 1)
- newhost = []
- port = ''
- if len(host) == 2:
- port = host.pop()
- for h in host[0].split('.'):
- newhost.append(h.encode('idna').decode('utf-8'))
- parts[1] = '.'.join(newhost)
- if port:
- parts[1] += ':' + port
- return urllib.parse.urlunsplit(parts)
- else:
- return url
-
-def _build_urllib2_request(url, agent, etag, modified, referrer, auth, request_headers):
- request = urllib.request.Request(url)
- request.add_header('User-Agent', agent)
- if etag:
- request.add_header('If-None-Match', etag)
- if isinstance(modified, str):
- modified = _parse_date(modified)
- elif isinstance(modified, datetime.datetime):
- modified = modified.utctimetuple()
- if modified:
- # format into an RFC 1123-compliant timestamp. We can't use
- # time.strftime() since the %a and %b directives can be affected
- # by the current locale, but RFC 2616 states that dates must be
- # in English.
- short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
- months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
- request.add_header('If-Modified-Since', '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5]))
- if referrer:
- request.add_header('Referer', referrer)
- if gzip and zlib:
- request.add_header('Accept-encoding', 'gzip, deflate')
- elif gzip:
- request.add_header('Accept-encoding', 'gzip')
- elif zlib:
- request.add_header('Accept-encoding', 'deflate')
- else:
- request.add_header('Accept-encoding', '')
- if auth:
- request.add_header('Authorization', 'Basic %s' % auth)
- if ACCEPT_HEADER:
- request.add_header('Accept', ACCEPT_HEADER)
- # use this for whatever -- cookies, special headers, etc
- # [('Cookie','Something'),('x-special-header','Another Value')]
- for header_name, header_value in list(request_headers.items()):
- request.add_header(header_name, header_value)
- request.add_header('A-IM', 'feed') # RFC 3229 support
- return request
+ return _StringIO(str(url_file_stream_or_string))
_date_handlers = []
def registerDateHandler(func):
'''Register a date handler function (takes string, returns 9-tuple date in GMT)'''
_date_handlers.insert(0, func)
-
+
# ISO-8601 date parsing routines written by Fazal Majid.
# The ISO 8601 standard is very convoluted and irregular - a full ISO 8601
# parser is beyond the scope of feedparser and would be a worthwhile addition
@@ -3099,8 +1844,8 @@ def registerDateHandler(func):
# 0301-04-01), so we use templates instead.
# Please note the order in templates is significant because we need a
# greedy match.
-_iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-0MM?-?DD', 'YYYY-MM', 'YYYY-?OOO',
- 'YY-?MM-?DD', 'YY-?OOO', 'YYYY',
+_iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-MM', 'YYYY-?OOO',
+ 'YY-?MM-?DD', 'YY-?OOO', 'YYYY',
'-YY-?MM', '-OOO', '-YY',
'--MM-?DD', '--MM',
'---DD',
@@ -3115,29 +1860,19 @@ _iso8601_re = [
'CC', r'(?P
\d\d$)')
+ r'(T?(?P\d{2}):(?P\d{2})'
+ r'(:(?P\d{2}))?'
- + r'(\.(?P\d+))?'
+ r'(?P[+-](?P\d{2})(:(?P\d{2}))?|Z)?)?'
for tmpl in _iso8601_tmpl]
-try:
- del tmpl
-except NameError:
- pass
+del tmpl
_iso8601_matches = [re.compile(regex).match for regex in _iso8601_re]
-try:
- del regex
-except NameError:
- pass
+del regex
def _parse_date_iso8601(dateString):
'''Parse a variety of ISO-8601-compatible formats like 20040105'''
m = None
for _iso8601_match in _iso8601_matches:
m = _iso8601_match(dateString)
- if m:
- break
- if not m:
- return
- if m.span() == (0, 0):
- return
+ if m: break
+ if not m: return
+ if m.span() == (0, 0): return
params = m.groupdict()
ordinal = params.get('ordinal', 0)
if ordinal:
@@ -3175,7 +1910,7 @@ def _parse_date_iso8601(dateString):
day = int(day)
# special case of the century - is the first year of the 21st century
# 2000 or 2001 ? The debate goes on...
- if 'century' in params:
+ if 'century' in params.keys():
year = (int(params['century']) - 1) * 100 + 1
# in ISO 8601 most fields are optional
for field in ['hour', 'minute', 'second', 'tzhour', 'tzmin']:
@@ -3183,10 +1918,14 @@ def _parse_date_iso8601(dateString):
params[field] = 0
hour = int(params.get('hour', 0))
minute = int(params.get('minute', 0))
- second = int(float(params.get('second', 0)))
+ second = int(params.get('second', 0))
# weekday is normalized by mktime(), we can ignore it
weekday = 0
- daylight_savings_flag = -1
+ # daylight savings is complex, but not needed for feedparser's purposes
+ # as time zones, if specified, include mention of whether it is active
+ # (e.g. PST vs. PDT, CET). Using -1 is implementation-dependent and
+ # and most implementations have DST bugs
+ daylight_savings_flag = 0
tm = [year, month, day, hour, minute, second, weekday,
ordinal, daylight_savings_flag]
# ISO 8601 time zone adjustments
@@ -3203,39 +1942,38 @@ def _parse_date_iso8601(dateString):
# Python's time.mktime() is a wrapper around the ANSI C mktime(3c)
# which is guaranteed to normalize d/m/y/h/m/s.
# Many implementations have bugs, but we'll pretend they don't.
- return time.localtime(time.mktime(tuple(tm)))
+ return time.localtime(time.mktime(tm))
registerDateHandler(_parse_date_iso8601)
-
+
# 8-bit date handling routines written by ytrewq1.
-_korean_year = '\ub144' # b3e2 in euc-kr
-_korean_month = '\uc6d4' # bff9 in euc-kr
-_korean_day = '\uc77c' # c0cf in euc-kr
-_korean_am = '\uc624\uc804' # bfc0 c0fc in euc-kr
-_korean_pm = '\uc624\ud6c4' # bfc0 c8c4 in euc-kr
+_korean_year = u'\ub144' # b3e2 in euc-kr
+_korean_month = u'\uc6d4' # bff9 in euc-kr
+_korean_day = u'\uc77c' # c0cf in euc-kr
+_korean_am = u'\uc624\uc804' # bfc0 c0fc in euc-kr
+_korean_pm = u'\uc624\ud6c4' # bfc0 c8c4 in euc-kr
_korean_onblog_date_re = \
re.compile('(\d{4})%s\s+(\d{2})%s\s+(\d{2})%s\s+(\d{2}):(\d{2}):(\d{2})' % \
(_korean_year, _korean_month, _korean_day))
_korean_nate_date_re = \
- re.compile('(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})' % \
+ re.compile(u'(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})' % \
(_korean_am, _korean_pm))
def _parse_date_onblog(dateString):
'''Parse a string according to the OnBlog 8-bit date format'''
m = _korean_onblog_date_re.match(dateString)
- if not m:
- return
+ if not m: return
w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
{'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\
'zonediff': '+09:00'}
+ if _debug: sys.stderr.write('OnBlog date parsed as: %s\n' % w3dtfdate)
return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_onblog)
def _parse_date_nate(dateString):
'''Parse a string according to the Nate 8-bit date format'''
m = _korean_nate_date_re.match(dateString)
- if not m:
- return
+ if not m: return
hour = int(m.group(5))
ampm = m.group(4)
if (ampm == _korean_pm):
@@ -3247,97 +1985,118 @@ def _parse_date_nate(dateString):
{'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
'hour': hour, 'minute': m.group(6), 'second': m.group(7),\
'zonediff': '+09:00'}
+ if _debug: sys.stderr.write('Nate date parsed as: %s\n' % w3dtfdate)
return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_nate)
+_mssql_date_re = \
+ re.compile('(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})(\.\d+)?')
+def _parse_date_mssql(dateString):
+ '''Parse a string according to the MS SQL date format'''
+ m = _mssql_date_re.match(dateString)
+ if not m: return
+ w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
+ {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
+ 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\
+ 'zonediff': '+09:00'}
+ if _debug: sys.stderr.write('MS SQL date parsed as: %s\n' % w3dtfdate)
+ return _parse_date_w3dtf(w3dtfdate)
+registerDateHandler(_parse_date_mssql)
+
# Unicode strings for Greek date strings
_greek_months = \
{ \
- '\u0399\u03b1\u03bd': 'Jan', # c9e1ed in iso-8859-7
- '\u03a6\u03b5\u03b2': 'Feb', # d6e5e2 in iso-8859-7
- '\u039c\u03ac\u03ce': 'Mar', # ccdcfe in iso-8859-7
- '\u039c\u03b1\u03ce': 'Mar', # cce1fe in iso-8859-7
- '\u0391\u03c0\u03c1': 'Apr', # c1f0f1 in iso-8859-7
- '\u039c\u03ac\u03b9': 'May', # ccdce9 in iso-8859-7
- '\u039c\u03b1\u03ca': 'May', # cce1fa in iso-8859-7
- '\u039c\u03b1\u03b9': 'May', # cce1e9 in iso-8859-7
- '\u0399\u03bf\u03cd\u03bd': 'Jun', # c9effded in iso-8859-7
- '\u0399\u03bf\u03bd': 'Jun', # c9efed in iso-8859-7
- '\u0399\u03bf\u03cd\u03bb': 'Jul', # c9effdeb in iso-8859-7
- '\u0399\u03bf\u03bb': 'Jul', # c9f9eb in iso-8859-7
- '\u0391\u03cd\u03b3': 'Aug', # c1fde3 in iso-8859-7
- '\u0391\u03c5\u03b3': 'Aug', # c1f5e3 in iso-8859-7
- '\u03a3\u03b5\u03c0': 'Sep', # d3e5f0 in iso-8859-7
- '\u039f\u03ba\u03c4': 'Oct', # cfeaf4 in iso-8859-7
- '\u039d\u03bf\u03ad': 'Nov', # cdefdd in iso-8859-7
- '\u039d\u03bf\u03b5': 'Nov', # cdefe5 in iso-8859-7
- '\u0394\u03b5\u03ba': 'Dec', # c4e5ea in iso-8859-7
+ u'\u0399\u03b1\u03bd': u'Jan', # c9e1ed in iso-8859-7
+ u'\u03a6\u03b5\u03b2': u'Feb', # d6e5e2 in iso-8859-7
+ u'\u039c\u03ac\u03ce': u'Mar', # ccdcfe in iso-8859-7
+ u'\u039c\u03b1\u03ce': u'Mar', # cce1fe in iso-8859-7
+ u'\u0391\u03c0\u03c1': u'Apr', # c1f0f1 in iso-8859-7
+ u'\u039c\u03ac\u03b9': u'May', # ccdce9 in iso-8859-7
+ u'\u039c\u03b1\u03ca': u'May', # cce1fa in iso-8859-7
+ u'\u039c\u03b1\u03b9': u'May', # cce1e9 in iso-8859-7
+ u'\u0399\u03bf\u03cd\u03bd': u'Jun', # c9effded in iso-8859-7
+ u'\u0399\u03bf\u03bd': u'Jun', # c9efed in iso-8859-7
+ u'\u0399\u03bf\u03cd\u03bb': u'Jul', # c9effdeb in iso-8859-7
+ u'\u0399\u03bf\u03bb': u'Jul', # c9f9eb in iso-8859-7
+ u'\u0391\u03cd\u03b3': u'Aug', # c1fde3 in iso-8859-7
+ u'\u0391\u03c5\u03b3': u'Aug', # c1f5e3 in iso-8859-7
+ u'\u03a3\u03b5\u03c0': u'Sep', # d3e5f0 in iso-8859-7
+ u'\u039f\u03ba\u03c4': u'Oct', # cfeaf4 in iso-8859-7
+ u'\u039d\u03bf\u03ad': u'Nov', # cdefdd in iso-8859-7
+ u'\u039d\u03bf\u03b5': u'Nov', # cdefe5 in iso-8859-7
+ u'\u0394\u03b5\u03ba': u'Dec', # c4e5ea in iso-8859-7
}
_greek_wdays = \
{ \
- '\u039a\u03c5\u03c1': 'Sun', # caf5f1 in iso-8859-7
- '\u0394\u03b5\u03c5': 'Mon', # c4e5f5 in iso-8859-7
- '\u03a4\u03c1\u03b9': 'Tue', # d4f1e9 in iso-8859-7
- '\u03a4\u03b5\u03c4': 'Wed', # d4e5f4 in iso-8859-7
- '\u03a0\u03b5\u03bc': 'Thu', # d0e5ec in iso-8859-7
- '\u03a0\u03b1\u03c1': 'Fri', # d0e1f1 in iso-8859-7
- '\u03a3\u03b1\u03b2': 'Sat', # d3e1e2 in iso-8859-7
+ u'\u039a\u03c5\u03c1': u'Sun', # caf5f1 in iso-8859-7
+ u'\u0394\u03b5\u03c5': u'Mon', # c4e5f5 in iso-8859-7
+ u'\u03a4\u03c1\u03b9': u'Tue', # d4f1e9 in iso-8859-7
+ u'\u03a4\u03b5\u03c4': u'Wed', # d4e5f4 in iso-8859-7
+ u'\u03a0\u03b5\u03bc': u'Thu', # d0e5ec in iso-8859-7
+ u'\u03a0\u03b1\u03c1': u'Fri', # d0e1f1 in iso-8859-7
+ u'\u03a3\u03b1\u03b2': u'Sat', # d3e1e2 in iso-8859-7
}
_greek_date_format_re = \
- re.compile('([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)')
+ re.compile(u'([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)')
def _parse_date_greek(dateString):
'''Parse a string according to a Greek 8-bit date format.'''
m = _greek_date_format_re.match(dateString)
- if not m:
+ if not m: return
+ try:
+ wday = _greek_wdays[m.group(1)]
+ month = _greek_months[m.group(3)]
+ except:
return
- wday = _greek_wdays[m.group(1)]
- month = _greek_months[m.group(3)]
rfc822date = '%(wday)s, %(day)s %(month)s %(year)s %(hour)s:%(minute)s:%(second)s %(zonediff)s' % \
{'wday': wday, 'day': m.group(2), 'month': month, 'year': m.group(4),\
'hour': m.group(5), 'minute': m.group(6), 'second': m.group(7),\
'zonediff': m.group(8)}
+ if _debug: sys.stderr.write('Greek date parsed as: %s\n' % rfc822date)
return _parse_date_rfc822(rfc822date)
registerDateHandler(_parse_date_greek)
# Unicode strings for Hungarian date strings
_hungarian_months = \
{ \
- 'janu\u00e1r': '01', # e1 in iso-8859-2
- 'febru\u00e1ri': '02', # e1 in iso-8859-2
- 'm\u00e1rcius': '03', # e1 in iso-8859-2
- '\u00e1prilis': '04', # e1 in iso-8859-2
- 'm\u00e1ujus': '05', # e1 in iso-8859-2
- 'j\u00fanius': '06', # fa in iso-8859-2
- 'j\u00falius': '07', # fa in iso-8859-2
- 'augusztus': '08',
- 'szeptember': '09',
- 'okt\u00f3ber': '10', # f3 in iso-8859-2
- 'november': '11',
- 'december': '12',
+ u'janu\u00e1r': u'01', # e1 in iso-8859-2
+ u'febru\u00e1ri': u'02', # e1 in iso-8859-2
+ u'm\u00e1rcius': u'03', # e1 in iso-8859-2
+ u'\u00e1prilis': u'04', # e1 in iso-8859-2
+ u'm\u00e1ujus': u'05', # e1 in iso-8859-2
+ u'j\u00fanius': u'06', # fa in iso-8859-2
+ u'j\u00falius': u'07', # fa in iso-8859-2
+ u'augusztus': u'08',
+ u'szeptember': u'09',
+ u'okt\u00f3ber': u'10', # f3 in iso-8859-2
+ u'november': u'11',
+ u'december': u'12',
}
_hungarian_date_format_re = \
- re.compile('(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})((\+|-)(\d{,2}:\d{2}))')
+ re.compile(u'(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})((\+|-)(\d{,2}:\d{2}))')
def _parse_date_hungarian(dateString):
'''Parse a string according to a Hungarian 8-bit date format.'''
m = _hungarian_date_format_re.match(dateString)
- if not m or m.group(2) not in _hungarian_months:
- return None
- month = _hungarian_months[m.group(2)]
- day = m.group(3)
- if len(day) == 1:
- day = '0' + day
- hour = m.group(4)
- if len(hour) == 1:
- hour = '0' + hour
+ if not m: return
+ try:
+ month = _hungarian_months[m.group(2)]
+ day = m.group(3)
+ if len(day) == 1:
+ day = '0' + day
+ hour = m.group(4)
+ if len(hour) == 1:
+ hour = '0' + hour
+ except:
+ return
w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s%(zonediff)s' % \
{'year': m.group(1), 'month': month, 'day': day,\
'hour': hour, 'minute': m.group(5),\
'zonediff': m.group(6)}
+ if _debug: sys.stderr.write('Hungarian date parsed as: %s\n' % w3dtfdate)
return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_hungarian)
@@ -3345,9 +2104,6 @@ registerDateHandler(_parse_date_hungarian)
# Drake and licensed under the Python license. Removed all range checking
# for month, day, hour, minute, and second, since mktime will normalize
# these later
-# Modified to also support MSSQL-style datetimes as defined at:
-# http://msdn.microsoft.com/en-us/library/ms186724.aspx
-# (which basically means allowing a space as a date/time/timezone separator)
def _parse_date_w3dtf(dateString):
def __extract_date(m):
year = int(m.group('year'))
@@ -3373,7 +2129,7 @@ def _parse_date_w3dtf(dateString):
day = 31
elif jday < julian:
if day + diff < 28:
- day = day + diff
+ day = day + diff
else:
month = month + 1
return year, month, day
@@ -3427,558 +2183,414 @@ def _parse_date_w3dtf(dateString):
__date_re = ('(?P<year>\d\d\d\d)'
'(?:(?P<dsep>-|)'
- '(?:(?P<month>\d\d)(?:(?P=dsep)(?P<day>\d\d))?'
- '|(?P<julian>\d\d\d)))?')
- __tzd_re = ' ?(?P<tzd>[-+](?P<tzdhours>\d\d)(?::?(?P<tzdminutes>\d\d))|Z)?'
+ '(?:(?P<julian>\d\d\d)'
+ '|(?P<month>\d\d)(?:(?P=dsep)(?P<day>\d\d))?))?')
+ __tzd_re = '(?P<tzd>[-+](?P<tzdhours>\d\d)(?::?(?P<tzdminutes>\d\d))|Z)'
+ __tzd_rx = re.compile(__tzd_re)
__time_re = ('(?P<hours>\d\d)(?P<tsep>:|)(?P<minutes>\d\d)'
- '(?:(?P=tsep)(?P<seconds>\d\d)(?:[.,]\d+)?)?'
+ '(?:(?P=tsep)(?P<seconds>\d\d(?:[.,]\d+)?))?'
+ __tzd_re)
- __datetime_re = '%s(?:[T ]%s)?' % (__date_re, __time_re)
+ __datetime_re = '%s(?:T%s)?' % (__date_re, __time_re)
__datetime_rx = re.compile(__datetime_re)
m = __datetime_rx.match(dateString)
- if (m is None) or (m.group() != dateString):
- return
+ if (m is None) or (m.group() != dateString): return
gmt = __extract_date(m) + __extract_time(m) + (0, 0, 0)
- if gmt[0] == 0:
- return
+ if gmt[0] == 0: return
return time.gmtime(time.mktime(gmt) + __extract_tzd(m) - time.timezone)
registerDateHandler(_parse_date_w3dtf)
-# Define the strings used by the RFC822 datetime parser
-_rfc822_months = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
- 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
-_rfc822_daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']
-
-# Only the first three letters of the month name matter
-_rfc822_month = "(?P<month>%s)(?:[a-z]*,?)" % ('|'.join(_rfc822_months))
-# The year may be 2 or 4 digits; capture the century if it exists
-_rfc822_year = "(?P<year>(?:\d{2})?\d{2})"
-_rfc822_day = "(?P<day> *\d{1,2})"
-_rfc822_date = "%s %s %s" % (_rfc822_day, _rfc822_month, _rfc822_year)
-
-_rfc822_hour = "(?P<hour>\d{2}):(?P<minute>\d{2})(?::(?P<second>\d{2}))?"
-_rfc822_tz = "(?P<tz>ut|gmt(?:[+-]\d{2}:\d{2})?|[aecmp][sd]?t|[zamny]|[+-]\d{4})"
-_rfc822_tznames = {
- 'ut': 0, 'gmt': 0, 'z': 0,
- 'adt': -3, 'ast': -4, 'at': -4,
- 'edt': -4, 'est': -5, 'et': -5,
- 'cdt': -5, 'cst': -6, 'ct': -6,
- 'mdt': -6, 'mst': -7, 'mt': -7,
- 'pdt': -7, 'pst': -8, 'pt': -8,
- 'a': -1, 'n': 1,
- 'm': -12, 'y': 12,
- }
-# The timezone may be prefixed by 'Etc/'
-_rfc822_time = "%s (?:etc/)?%s" % (_rfc822_hour, _rfc822_tz)
-
-_rfc822_dayname = "(?P<dayname>%s)" % ('|'.join(_rfc822_daynames))
-_rfc822_match = re.compile(
- "(?:%s, )?%s(?: %s)?" % (_rfc822_dayname, _rfc822_date, _rfc822_time)
-).match
-
-def _parse_date_group_rfc822(m):
- # Calculate a date and timestamp
- for k in ('year', 'day', 'hour', 'minute', 'second'):
- m[k] = int(m[k])
- m['month'] = _rfc822_months.index(m['month']) + 1
- # If the year is 2 digits, assume everything in the 90's is the 1990's
- if m['year'] < 100:
- m['year'] += (1900, 2000)[m['year'] < 90]
- stamp = datetime.datetime(*[m[i] for i in
- ('year', 'month', 'day', 'hour', 'minute', 'second')])
-
- # Use the timezone information to calculate the difference between
- # the given date and timestamp and Universal Coordinated Time
- tzhour = 0
- tzmin = 0
- if m['tz'] and m['tz'].startswith('gmt'):
- # Handle GMT and GMT+hh:mm timezone syntax (the trailing
- # timezone info will be handled by the next `if` block)
- m['tz'] = ''.join(m['tz'][3:].split(':')) or 'gmt'
- if not m['tz']:
- pass
- elif m['tz'].startswith('+'):
- tzhour = int(m['tz'][1:3])
- tzmin = int(m['tz'][3:])
- elif m['tz'].startswith('-'):
- tzhour = int(m['tz'][1:3]) * -1
- tzmin = int(m['tz'][3:]) * -1
- else:
- tzhour = _rfc822_tznames[m['tz']]
- delta = datetime.timedelta(0, 0, 0, 0, tzmin, tzhour)
-
- # Return the date and timestamp in UTC
- return (stamp - delta).utctimetuple()
-
-def _parse_date_rfc822(dt):
- """Parse RFC 822 dates and times, with one minor
- difference: years may be 4DIGIT or 2DIGIT.
- http://tools.ietf.org/html/rfc822#section-5"""
- try:
- m = _rfc822_match(dt.lower()).groupdict(0)
- except AttributeError:
- return None
-
- return _parse_date_group_rfc822(m)
-registerDateHandler(_parse_date_rfc822)
-
-def _parse_date_rfc822_grubby(dt):
- """Parse date format similar to RFC 822, but
- the comma after the dayname is optional and
- month/day are inverted"""
- _rfc822_date_grubby = "%s %s %s" % (_rfc822_month, _rfc822_day, _rfc822_year)
- _rfc822_match_grubby = re.compile(
- "(?:%s[,]? )?%s(?: %s)?" % (_rfc822_dayname, _rfc822_date_grubby, _rfc822_time)
- ).match
-
- try:
- m = _rfc822_match_grubby(dt.lower()).groupdict(0)
- except AttributeError:
- return None
-
- return _parse_date_group_rfc822(m)
-registerDateHandler(_parse_date_rfc822_grubby)
-
-def _parse_date_asctime(dt):
- """Parse asctime-style dates"""
- dayname, month, day, remainder = dt.split(None, 3)
- # Convert month and day into zero-padded integers
- month = '%02i ' % (_rfc822_months.index(month.lower()) + 1)
- day = '%02i ' % (int(day),)
- dt = month + day + remainder
- return time.strptime(dt, '%m %d %H:%M:%S %Y')[:-1] + (0, )
-registerDateHandler(_parse_date_asctime)
-
-def _parse_date_perforce(aDateString):
- """parse a date in yyyy/mm/dd hh:mm:ss TTT format"""
- # Fri, 2006/09/15 08:19:53 EDT
- _my_date_pattern = re.compile( \
- r'(\w{,3}), (\d{,4})/(\d{,2})/(\d{2}) (\d{,2}):(\d{2}):(\d{2}) (\w{,3})')
-
- m = _my_date_pattern.search(aDateString)
- if m is None:
- return None
- dow, year, month, day, hour, minute, second, tz = m.groups()
- months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
- dateString = "%s, %s %s %s %s:%s:%s %s" % (dow, day, months[int(month) - 1], year, hour, minute, second, tz)
+def _parse_date_rfc822(dateString):
+ '''Parse an RFC822, RFC1123, RFC2822, or asctime-style date'''
+ data = dateString.split()
+ if data[0][-1] in (',', '.') or data[0].lower() in rfc822._daynames:
+ del data[0]
+ if len(data) == 4:
+ s = data[3]
+ i = s.find('+')
+ if i > 0:
+ data[3:] = [s[:i], s[i+1:]]
+ else:
+ data.append('')
+ dateString = " ".join(data)
+ if len(data) < 5:
+ dateString += ' 00:00:00 GMT'
tm = rfc822.parsedate_tz(dateString)
if tm:
return time.gmtime(rfc822.mktime_tz(tm))
-registerDateHandler(_parse_date_perforce)
+# rfc822.py defines several time zones, but we define some extra ones.
+# 'ET' is equivalent to 'EST', etc.
+_additional_timezones = {'AT': -400, 'ET': -500, 'CT': -600, 'MT': -700, 'PT': -800}
+rfc822._timezones.update(_additional_timezones)
+registerDateHandler(_parse_date_rfc822)
def _parse_date(dateString):
'''Parses a variety of date formats into a 9-tuple in GMT'''
- if not dateString:
- return None
for handler in _date_handlers:
try:
date9tuple = handler(dateString)
- except (KeyError, OverflowError, ValueError):
- continue
- if not date9tuple:
- continue
- if len(date9tuple) != 9:
- continue
- return date9tuple
+ if not date9tuple: continue
+ if len(date9tuple) != 9:
+ if _debug: sys.stderr.write('date handler function must return 9-tuple\n')
+ raise ValueError
+ map(int, date9tuple)
+ return date9tuple
+ except Exception, e:
+ if _debug: sys.stderr.write('%s raised %s\n' % (handler.__name__, repr(e)))
+ pass
return None
-# Each marker represents some of the characters of the opening XML
-# processing instruction ('<?xm') in the specified encoding.
-EBCDIC_MARKER = _l2bytes([0x4C, 0x6F, 0xA7, 0x94])
-UTF16BE_MARKER = _l2bytes([0x00, 0x3C, 0x00, 0x3F])
-UTF16LE_MARKER = _l2bytes([0x3C, 0x00, 0x3F, 0x00])
-UTF32BE_MARKER = _l2bytes([0x00, 0x00, 0x00, 0x3C])
-UTF32LE_MARKER = _l2bytes([0x3C, 0x00, 0x00, 0x00])
-
-ZERO_BYTES = _l2bytes([0x00, 0x00])
-
-# Match the opening XML declaration.
-# Example: <?xml version="1.0" encoding="utf-8"?>
-RE_XML_DECLARATION = re.compile('^<\?xml[^>]*?>')
-
-# Capture the value of the XML processing instruction's encoding attribute.
-# Example: <?xml version="1.0" encoding="utf-8"?>
-RE_XML_PI_ENCODING = re.compile(_s2bytes('^<\?.*encoding=[\'"](.*?)[\'"].*\?>'))
-
-def convert_to_utf8(http_headers, data):
- '''Detect and convert the character encoding to UTF-8.
+def _getCharacterEncoding(http_headers, xml_data):
+ '''Get the character encoding of the XML document
http_headers is a dictionary
- data is a raw string (not Unicode)'''
+ xml_data is a raw string (not Unicode)
+
+ This is so much trickier than it sounds, it's not even funny.
+ According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type
+ is application/xml, application/*+xml,
+ application/xml-external-parsed-entity, or application/xml-dtd,
+ the encoding given in the charset parameter of the HTTP Content-Type
+ takes precedence over the encoding given in the XML prefix within the
+ document, and defaults to 'utf-8' if neither are specified. But, if
+ the HTTP Content-Type is text/xml, text/*+xml, or
+ text/xml-external-parsed-entity, the encoding given in the XML prefix
+ within the document is ALWAYS IGNORED and only the encoding given in
+ the charset parameter of the HTTP Content-Type header should be
+ respected, and it defaults to 'us-ascii' if not specified.
- # This is so much trickier than it sounds, it's not even funny.
- # According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type
- # is application/xml, application/*+xml,
- # application/xml-external-parsed-entity, or application/xml-dtd,
- # the encoding given in the charset parameter of the HTTP Content-Type
- # takes precedence over the encoding given in the XML prefix within the
- # document, and defaults to 'utf-8' if neither are specified. But, if
- # the HTTP Content-Type is text/xml, text/*+xml, or
- # text/xml-external-parsed-entity, the encoding given in the XML prefix
- # within the document is ALWAYS IGNORED and only the encoding given in
- # the charset parameter of the HTTP Content-Type header should be
- # respected, and it defaults to 'us-ascii' if not specified.
+ Furthermore, discussion on the atom-syntax mailing list with the
+ author of RFC 3023 leads me to the conclusion that any document
+ served with a Content-Type of text/* and no charset parameter
+ must be treated as us-ascii. (We now do this.) And also that it
+ must always be flagged as non-well-formed. (We now do this too.)
+
+ If Content-Type is unspecified (input was local file or non-HTTP source)
+ or unrecognized (server just got it totally wrong), then go by the
+ encoding given in the XML prefix of the document and default to
+ 'iso-8859-1' as per the HTTP specification (RFC 2616).
+
+ Then, assuming we didn't find a character encoding in the HTTP headers
+ (and the HTTP Content-type allowed us to look in the body), we need
+ to sniff the first few bytes of the XML data and try to determine
+ whether the encoding is ASCII-compatible. Section F of the XML
+ specification shows the way here:
+ http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
- # Furthermore, discussion on the atom-syntax mailing list with the
- # author of RFC 3023 leads me to the conclusion that any document
- # served with a Content-Type of text/* and no charset parameter
- # must be treated as us-ascii. (We now do this.) And also that it
- # must always be flagged as non-well-formed. (We now do this too.)
+ If the sniffed encoding is not ASCII-compatible, we need to make it
+ ASCII compatible so that we can sniff further into the XML declaration
+ to find the encoding attribute, which will tell us the true encoding.
- # If Content-Type is unspecified (input was local file or non-HTTP source)
- # or unrecognized (server just got it totally wrong), then go by the
- # encoding given in the XML prefix of the document and default to
- # 'iso-8859-1' as per the HTTP specification (RFC 2616).
+ Of course, none of this guarantees that we will be able to parse the
+ feed in the declared character encoding (assuming it was declared
+ correctly, which many are not). CJKCodecs and iconv_codec help a lot;
+ you should definitely install them if you can.
+ http://cjkpython.i18n.org/
+ '''
- # Then, assuming we didn't find a character encoding in the HTTP headers
- # (and the HTTP Content-type allowed us to look in the body), we need
- # to sniff the first few bytes of the XML data and try to determine
- # whether the encoding is ASCII-compatible. Section F of the XML
- # specification shows the way here:
- # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
+ def _parseHTTPContentType(content_type):
+ '''takes HTTP Content-Type header and returns (content type, charset)
- # If the sniffed encoding is not ASCII-compatible, we need to make it
- # ASCII compatible so that we can sniff further into the XML declaration
- # to find the encoding attribute, which will tell us the true encoding.
+ If no charset is specified, returns (content type, '')
+ If no content type is specified, returns ('', '')
+ Both return parameters are guaranteed to be lowercase strings
+ '''
+ content_type = content_type or ''
+ content_type, params = cgi.parse_header(content_type)
+ return content_type, params.get('charset', '').replace("'", '')
- # Of course, none of this guarantees that we will be able to parse the
- # feed in the declared character encoding (assuming it was declared
- # correctly, which many are not). iconv_codec can help a lot;
- # you should definitely install it if you can.
- # http://cjkpython.i18n.org/
-
- bom_encoding = ''
+ sniffed_xml_encoding = ''
xml_encoding = ''
- rfc3023_encoding = ''
-
- # Look at the first few bytes of the document to guess what
- # its encoding may be. We only need to decode enough of the
- # document that we can use an ASCII-compatible regular
- # expression to search for an XML encoding declaration.
- # The heuristic follows the XML specification, section F:
+ true_encoding = ''
+ http_content_type, http_encoding = _parseHTTPContentType(http_headers.get('content-type'))
+ # Must sniff for non-ASCII-compatible character encodings before
+ # searching for XML declaration. This heuristic is defined in
+ # section F of the XML specification:
# http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
- # Check for BOMs first.
- if data[:4] == codecs.BOM_UTF32_BE:
- bom_encoding = 'utf-32be'
- data = data[4:]
- elif data[:4] == codecs.BOM_UTF32_LE:
- bom_encoding = 'utf-32le'
- data = data[4:]
- elif data[:2] == codecs.BOM_UTF16_BE and data[2:4] != ZERO_BYTES:
- bom_encoding = 'utf-16be'
- data = data[2:]
- elif data[:2] == codecs.BOM_UTF16_LE and data[2:4] != ZERO_BYTES:
- bom_encoding = 'utf-16le'
- data = data[2:]
- elif data[:3] == codecs.BOM_UTF8:
- bom_encoding = 'utf-8'
- data = data[3:]
- # Check for the characters '<?xm' in several encodings.
+ try:
+ if xml_data[:4] == '\x4c\x6f\xa7\x94':
+ # EBCDIC
+ xml_data = _ebcdic_to_ascii(xml_data)
+ elif xml_data[:4] == '\x00\x3c\x00\x3f':
+ # UTF-16BE
+ sniffed_xml_encoding = 'utf-16be'
+ xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
+ elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') and (xml_data[2:4] != '\x00\x00'):
+ # UTF-16BE with BOM
+ sniffed_xml_encoding = 'utf-16be'
+ xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
+ elif xml_data[:4] == '\x3c\x00\x3f\x00':
+ # UTF-16LE
+ sniffed_xml_encoding = 'utf-16le'
+ xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
+ elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and (xml_data[2:4] != '\x00\x00'):
+ # UTF-16LE with BOM
+ sniffed_xml_encoding = 'utf-16le'
+ xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
+ elif xml_data[:4] == '\x00\x00\x00\x3c':
+ # UTF-32BE
+ sniffed_xml_encoding = 'utf-32be'
+ xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
+ elif xml_data[:4] == '\x3c\x00\x00\x00':
+ # UTF-32LE
+ sniffed_xml_encoding = 'utf-32le'
+ xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
+ elif xml_data[:4] == '\x00\x00\xfe\xff':
+ # UTF-32BE with BOM
+ sniffed_xml_encoding = 'utf-32be'
+ xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
+ elif xml_data[:4] == '\xff\xfe\x00\x00':
+ # UTF-32LE with BOM
+ sniffed_xml_encoding = 'utf-32le'
+ xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
+ elif xml_data[:3] == '\xef\xbb\xbf':
+ # UTF-8 with BOM
+ sniffed_xml_encoding = 'utf-8'
+ xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
+ else:
+ # ASCII-compatible
+ pass
+ xml_encoding_match = re.compile('^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
+ except:
xml_encoding_match = None
- else:
- xml_encoding_match = RE_XML_PI_ENCODING.match(tempdata)
-
if xml_encoding_match:
- xml_encoding = xml_encoding_match.groups()[0].decode('utf-8').lower()
- # Normalize the xml_encoding if necessary.
- if bom_encoding and (xml_encoding in (
- 'u16', 'utf-16', 'utf16', 'utf_16',
- 'u32', 'utf-32', 'utf32', 'utf_32',
- 'iso-10646-ucs-2', 'iso-10646-ucs-4',
- 'csucs4', 'csunicode', 'ucs-2', 'ucs-4'
- )):
- xml_encoding = bom_encoding
-
- # Find the HTTP Content-Type and, hopefully, a character
- # encoding provided by the server. The Content-Type is used
- # to choose the "correct" encoding among the BOM encoding,
- # XML declaration encoding, and HTTP encoding, following the
- # heuristic defined in RFC 3023.
- http_content_type = http_headers.get('content-type') or ''
- http_content_type, params = cgi.parse_header(http_content_type)
- http_encoding = params.get('charset', '').replace("'", "")
- if not isinstance(http_encoding, str):
- http_encoding = http_encoding.decode('utf-8', 'ignore')
-
+ xml_encoding = xml_encoding_match.groups()[0].lower()
+ if sniffed_xml_encoding and (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16')):
+ xml_encoding = sniffed_xml_encoding
acceptable_content_type = 0
- application_content_types = ('application/xml', 'application/xml-dtd',
- 'application/xml-external-parsed-entity')
+ application_content_types = ('application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity')
text_content_types = ('text/xml', 'text/xml-external-parsed-entity')
if (http_content_type in application_content_types) or \
- (http_content_type.startswith('application/') and
- http_content_type.endswith('+xml')):
+ (http_content_type.startswith('application/') and http_content_type.endswith('+xml')):
acceptable_content_type = 1
- rfc3023_encoding = http_encoding or xml_encoding or 'utf-8'
+ true_encoding = http_encoding or xml_encoding or 'utf-8'
elif (http_content_type in text_content_types) or \
- (http_content_type.startswith('text/') and
- http_content_type.endswith('+xml')):
+ (http_content_type.startswith('text/')) and http_content_type.endswith('+xml'):
acceptable_content_type = 1
- rfc3023_encoding = http_encoding or 'us-ascii'
+ true_encoding = http_encoding or 'us-ascii'
elif http_content_type.startswith('text/'):
- rfc3023_encoding = http_encoding or 'us-ascii'
- elif http_headers and 'content-type' not in http_headers:
- rfc3023_encoding = xml_encoding or 'iso-8859-1'
+ true_encoding = http_encoding or 'us-ascii'
+ elif http_headers and (not http_headers.has_key('content-type')):
+ true_encoding = xml_encoding or 'iso-8859-1'
else:
- rfc3023_encoding = xml_encoding or 'utf-8'
- # gb18030 is a superset of gb2312, so always replace gb2312
- # with gb18030 for greater compatibility.
- if rfc3023_encoding.lower() == 'gb2312':
- rfc3023_encoding = 'gb18030'
- if xml_encoding.lower() == 'gb2312':
- xml_encoding = 'gb18030'
+ true_encoding = xml_encoding or 'utf-8'
+ return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type
+
+def _toUTF8(data, encoding):
+ '''Changes an XML data stream on the fly to specify a new encoding
+
+ data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already
+ encoding is a string recognized by encodings.aliases
+ '''
+ if _debug: sys.stderr.write('entering _toUTF8, trying encoding %s\n' % encoding)
+ # strip Byte Order Mark (if present)
+ if (len(data) >= 4) and (data[:2] == '\xfe\xff') and (data[2:4] != '\x00\x00'):
+ if _debug:
+ sys.stderr.write('stripping BOM\n')
+ if encoding != 'utf-16be':
+ sys.stderr.write('trying utf-16be instead\n')
+ encoding = 'utf-16be'
+ data = data[2:]
+ elif (len(data) >= 4) and (data[:2] == '\xff\xfe') and (data[2:4] != '\x00\x00'):
+ if _debug:
+ sys.stderr.write('stripping BOM\n')
+ if encoding != 'utf-16le':
+ sys.stderr.write('trying utf-16le instead\n')
+ encoding = 'utf-16le'
+ data = data[2:]
+ elif data[:3] == '\xef\xbb\xbf':
+ if _debug:
+ sys.stderr.write('stripping BOM\n')
+ if encoding != 'utf-8':
+ sys.stderr.write('trying utf-8 instead\n')
+ encoding = 'utf-8'
+ data = data[3:]
+ elif data[:4] == '\x00\x00\xfe\xff':
+ if _debug:
+ sys.stderr.write('stripping BOM\n')
+ if encoding != 'utf-32be':
+ sys.stderr.write('trying utf-32be instead\n')
+ encoding = 'utf-32be'
+ data = data[4:]
+ elif data[:4] == '\xff\xfe\x00\x00':
+ if _debug:
+ sys.stderr.write('stripping BOM\n')
+ if encoding != 'utf-32le':
+ sys.stderr.write('trying utf-32le instead\n')
+ encoding = 'utf-32le'
+ data = data[4:]
+ newdata = unicode(data, encoding)
+ if _debug: sys.stderr.write('successfully converted %s data to unicode\n' % encoding)
+ declmatch = re.compile('^<\?xml[^>]*?>')
+ newdecl = '''<?xml version='1.0' encoding='utf-8'?>'''
+ if declmatch.search(newdata):
+ newdata = declmatch.sub(newdecl, newdata)
+ else:
+ newdata = newdecl + u'\n' + newdata
+ return newdata.encode('utf-8')
+
+def _stripDoctype(data):
+ '''Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
+
+ rss_version may be 'rss091n' or None
+ stripped_data is the same XML document, minus the DOCTYPE
+ '''
+ entity_pattern = re.compile(r'<!ENTITY([^>]*?)>', re.MULTILINE)
+ data = entity_pattern.sub('', data)
+ doctype_pattern = re.compile(r'<!DOCTYPE([^>]*?)>', re.MULTILINE)
+ doctype_results = doctype_pattern.findall(data)
+ doctype = doctype_results and doctype_results[0] or ''
+ if doctype.lower().count('netscape'):
+ version = 'rss091n'
+ else:
+ version = None
+ data = doctype_pattern.sub('', data)
+ return version, data
+
+def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=[]):
+ '''Parse a feed from a URL, file, stream, or string'''
+ result = FeedParserDict()
+ result['feed'] = FeedParserDict()
+ result['entries'] = []
+ if _XML_AVAILABLE:
+ result['bozo'] = 0
+ if type(handlers) == types.InstanceType:
+ handlers = [handlers]
+ try:
+ f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers)
+ data = f.read()
+ except Exception, e:
+ result['bozo'] = 1
+ result['bozo_exception'] = e
+ data = ''
+ f = None
+
+ # if feed is gzip-compressed, decompress it
+ if f and data and hasattr(f, 'headers'):
+ if gzip and f.headers.get('content-encoding', '') == 'gzip':
+ try:
+ data = gzip.GzipFile(fileobj=_StringIO(data)).read()
+ except Exception, e:
+ # Some feeds claim to be gzipped but they're not, so
+ # we get garbage. Ideally, we should re-request the
+ # feed without the 'Accept-encoding: gzip' header,
+ # but we don't.
+ result['bozo'] = 1
+ result['bozo_exception'] = e
+ data = ''
+ elif zlib and f.headers.get('content-encoding', '') == 'deflate':
+ try:
+ data = zlib.decompress(data, -zlib.MAX_WBITS)
+ except Exception, e:
+ result['bozo'] = 1
+ result['bozo_exception'] = e
+ data = ''
+
+ # save HTTP headers
+ if hasattr(f, 'info'):
+ info = f.info()
+ result['etag'] = info.getheader('ETag')
+ last_modified = info.getheader('Last-Modified')
+ if last_modified:
+ result['modified'] = _parse_date(last_modified)
+ if hasattr(f, 'url'):
+ result['href'] = f.url
+ result['status'] = 200
+ if hasattr(f, 'status'):
+ result['status'] = f.status
+ if hasattr(f, 'headers'):
+ result['headers'] = f.headers.dict
+ if hasattr(f, 'close'):
+ f.close()
# there are four encodings to keep track of:
# - http_encoding is the encoding declared in the Content-Type HTTP header
# - xml_encoding is the encoding declared in the <?xml declaration
# - sniffed_xml_encoding is the encoding sniffed from the first 4 bytes of the XML data
# - result['encoding'] is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
+ http_headers = result.get('headers', {})
+ result['encoding'], http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type = \
+ _getCharacterEncoding(http_headers, data)
+ if http_headers and (not acceptable_content_type):
+ if http_headers.has_key('content-type'):
+ bozo_message = '%s is not an XML media type' % http_headers['content-type']
+ else:
- new_declaration = '''<?xml version='1.0' encoding='utf-8'?>'''
- if RE_XML_DECLARATION.search(data):
- data = RE_XML_DECLARATION.sub(new_declaration, data)
- else:
- data = new_declaration + '\n' + data
- data = data.encode('utf-8')
- break
- # if still no luck, give up
- if not known_encoding:
- error = CharacterEncodingUnknown(
- 'document encoding unknown, I tried ' +
- '%s, %s, utf-8, windows-1252, and iso-8859-2 but nothing worked' %
- (rfc3023_encoding, xml_encoding))
- rfc3023_encoding = ''
- elif proposed_encoding != rfc3023_encoding:
- error = CharacterEncodingOverride(
- 'document declared as %s, but parsed as %s' %
- (rfc3023_encoding, proposed_encoding))
- rfc3023_encoding = proposed_encoding
-
- return data, rfc3023_encoding, error
-
-# Match XML entity declarations.
-# Example: <!ENTITY copyright "(C)">
-RE_ENTITY_PATTERN = re.compile(_s2bytes(r'^\s*<!ENTITY([^>]*?)>'), re.MULTILINE)
-
-# Match XML DOCTYPE declarations.
-# Example: <!DOCTYPE feed [ ]>
-RE_DOCTYPE_PATTERN = re.compile(_s2bytes(r'^\s*<!DOCTYPE([^>]*?)>'), re.MULTILINE)
-
-# Match safe entity declarations.
-# This will allow hexadecimal character references through,
-# as well as text, but not arbitrary nested entities.
-# Example: <!ENTITY cubed "&#179;">
-# Example: <!ENTITY copyright "(C)">
-# Forbidden: <!ENTITY explode1 "&explode2;&explode2;">
-RE_SAFE_ENTITY_PATTERN = re.compile(_s2bytes('\s+(\w+)\s+"(&#\w+;|[^&"]*)"'))
-
-def replace_doctype(data):
- '''Strips and replaces the DOCTYPE, returns (rss_version, stripped_data)
-
- rss_version may be 'rss091n' or None
- stripped_data is the same XML document with a replaced DOCTYPE
- '''
-
- # Divide the document into two groups by finding the location
- # of the first element that doesn't begin with '<?' or '<!'.
- start = re.search(_s2bytes('<\w'), data)
- start = start and start.start() or -1
- head, data = data[:start+1], data[start+1:]
-
- # Save and then remove all of the ENTITY declarations.
- entity_results = RE_ENTITY_PATTERN.findall(head)
- head = RE_ENTITY_PATTERN.sub(_s2bytes(''), head)
-
- # Find the DOCTYPE declaration and check the feed type.
- doctype_results = RE_DOCTYPE_PATTERN.findall(head)
- doctype = doctype_results and doctype_results[0] or _s2bytes('')
- if _s2bytes('netscape') in doctype.lower():
- version = 'rss091n'
- else:
- version = None
-
- # Re-insert the safe ENTITY declarations if a DOCTYPE was found.
- replacement = _s2bytes('')
- if len(doctype_results) == 1 and entity_results:
- match_safe_entities = lambda e: RE_SAFE_ENTITY_PATTERN.match(e)
- safe_entities = filter(match_safe_entities, entity_results)
- if safe_entities:
- replacement = _s2bytes('<!DOCTYPE feed [\n<!ENTITY ') \
- + _s2bytes('>\n<!ENTITY ').join(safe_entities) \
- + _s2bytes('>\n]>')
- data = RE_DOCTYPE_PATTERN.sub(replacement, head) + data
-
- # Precompute the safe entities for the loose parser.
- safe_entities = dict((k.decode('utf-8'), v.decode('utf-8'))
- for k, v in RE_SAFE_ENTITY_PATTERN.findall(replacement))
- return version, data, safe_entities
-
-def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=None, request_headers=None, response_headers=None):
- '''Parse a feed from a URL, file, stream, or string.
-
- request_headers, if given, is a dict from http header name to value to add
- to the request; this overrides internally generated values.
- '''
-
- if handlers is None:
- handlers = []
- if request_headers is None:
- request_headers = {}
- if response_headers is None:
- response_headers = {}
-
- result = FeedParserDict()
- result['feed'] = FeedParserDict()
- result['entries'] = []
- result['bozo'] = 0
- if not isinstance(handlers, list):
- handlers = [handlers]
- try:
- f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers)
- data = f.read()
- except Exception as e:
+ bozo_message = 'no Content-type specified'
result['bozo'] = 1
- result['bozo_exception'] = e
- data = None
- f = None
+ result['bozo_exception'] = NonXMLContentType(bozo_message)
+
+ result['version'], data = _stripDoctype(data)
- if hasattr(f, 'headers'):
- result['headers'] = dict(f.headers)
- # overwrite existing headers using response_headers
- if 'headers' in result:
- result['headers'].update(response_headers)
- elif response_headers:
- result['headers'] = copy.deepcopy(response_headers)
+ baseuri = http_headers.get('content-location', result.get('href'))
+ baselang = http_headers.get('content-language', None)
- # lowercase all of the HTTP headers for comparisons per RFC 2616
- if 'headers' in result:
- http_headers = dict((k.lower(), v) for k, v in list(result['headers'].items()))
- else:
- http_headers = {}
-
- # if feed is gzip-compressed, decompress it
- if f and data and http_headers:
- if gzip and 'gzip' in http_headers.get('content-encoding', ''):
- try:
- data = gzip.GzipFile(fileobj=_StringIO(data)).read()
- except (IOError, struct.error) as e:
- # IOError can occur if the gzip header is bad.
- # struct.error can occur if the data is damaged.
- result['bozo'] = 1
- result['bozo_exception'] = e
- if isinstance(e, struct.error):
- # A gzip header was found but the data is corrupt.
- # Ideally, we should re-request the feed without the
- # 'Accept-encoding: gzip' header, but we don't.
- data = None
- elif zlib and 'deflate' in http_headers.get('content-encoding', ''):
- try:
- data = zlib.decompress(data)
- except zlib.error as e:
- try:
- # The data may have no headers and no checksum.
- data = zlib.decompress(data, -15)
- except zlib.error as e:
- result['bozo'] = 1
- result['bozo_exception'] = e
-
- # save HTTP headers
- if http_headers:
- if 'etag' in http_headers:
- etag = http_headers.get('etag', '')
- if not isinstance(etag, str):
- etag = etag.decode('utf-8', 'ignore')
- if etag:
- result['etag'] = etag
- if 'last-modified' in http_headers:
- modified = http_headers.get('last-modified', '')
- if modified:
- result['modified'] = modified
- result['modified_parsed'] = _parse_date(modified)
- if hasattr(f, 'url'):
- if not isinstance(f.url, str):
- result['href'] = f.url.decode('utf-8', 'ignore')
- else:
- result['href'] = f.url
- result['status'] = 200
- if hasattr(f, 'status'):
- result['status'] = f.status
- if hasattr(f, 'close'):
- f.close()
-
- if data is None:
- return result
-
- # Stop processing if the server sent HTTP 304 Not Modified.
- if getattr(f, 'code', 0) == 304:
+ # if server sent 304, we're done
+ if result.get('status', 0) == 304:
result['version'] = ''
result['debug_message'] = 'The feed has not changed since you last checked, ' + \
'so the server sent no data. This is a feature, not a bug!'
return result
- data, result['encoding'], error = convert_to_utf8(http_headers, data)
- use_strict_parser = result['encoding'] and True or False
- if error is not None:
+ # if there was a problem downloading, we're done
+ if not data:
+ return result
+
+ # determine character encoding
+ use_strict_parser = 0
+ known_encoding = 0
+ tried_encodings = []
+ # try: HTTP encoding, declared XML encoding, encoding sniffed from BOM
+ for proposed_encoding in (result['encoding'], xml_encoding, sniffed_xml_encoding):
+ if not proposed_encoding: continue
+ if proposed_encoding in tried_encodings: continue
+ tried_encodings.append(proposed_encoding)
+ try:
+ data = _toUTF8(data, proposed_encoding)
+ known_encoding = use_strict_parser = 1
+ break
+ except:
+ pass
+ # if no luck and we have auto-detection library, try that
+ if (not known_encoding) and chardet:
+ try:
+ proposed_encoding = chardet.detect(data)['encoding']
+ if proposed_encoding and (proposed_encoding not in tried_encodings):
+ tried_encodings.append(proposed_encoding)
+ data = _toUTF8(data, proposed_encoding)
+ known_encoding = use_strict_parser = 1
+ except:
+ pass
+ # if still no luck and we haven't tried utf-8 yet, try that
+ if (not known_encoding) and ('utf-8' not in tried_encodings):
+ try:
+ proposed_encoding = 'utf-8'
+ tried_encodings.append(proposed_encoding)
+ data = _toUTF8(data, proposed_encoding)
+ known_encoding = use_strict_parser = 1
+ except:
+ pass
+ # if still no luck and we haven't tried windows-1252 yet, try that
+ if (not known_encoding) and ('windows-1252' not in tried_encodings):
+ try:
+ proposed_encoding = 'windows-1252'
+ tried_encodings.append(proposed_encoding)
+ data = _toUTF8(data, proposed_encoding)
+ known_encoding = use_strict_parser = 1
+ except:
+ pass
+ # if still no luck, give up
+ if not known_encoding:
result['bozo'] = 1
- result['bozo_exception'] = error
-
- result['version'], data, entities = replace_doctype(data)
-
- # Ensure that baseuri is an absolute URI using an acceptable URI scheme.
- contentloc = http_headers.get('content-location', '')
- href = result.get('href', '')
- baseuri = _makeSafeAbsoluteURI(href, contentloc) or _makeSafeAbsoluteURI(contentloc) or href
-
- baselang = http_headers.get('content-language', None)
- if not isinstance(baselang, str) and baselang is not None:
- baselang = baselang.decode('utf-8', 'ignore')
+ result['bozo_exception'] = CharacterEncodingUnknown( \
+ 'document encoding unknown, I tried ' + \
+ '%s, %s, utf-8, and windows-1252 but nothing worked' % \
+ (result['encoding'], xml_encoding))
+ result['encoding'] = ''
+ elif proposed_encoding != result['encoding']:
+ result['bozo'] = 1
+ result['bozo_exception'] = CharacterEncodingOverride( \
+ 'documented declared as %s, but parsed as %s' % \
+ (result['encoding'], proposed_encoding))
+ result['encoding'] = proposed_encoding
if not _XML_AVAILABLE:
use_strict_parser = 0
@@ -3987,26 +2599,260 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer
feedparser = _StrictFeedParser(baseuri, baselang, 'utf-8')
saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS)
saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
- try:
- # disable downloading external doctype references, if possible
- saxparser.setFeature(xml.sax.handler.feature_external_ges, 0)
- except xml.sax.SAXNotSupportedException:
- pass
saxparser.setContentHandler(feedparser)
saxparser.setErrorHandler(feedparser)
source = xml.sax.xmlreader.InputSource()
source.setByteStream(_StringIO(data))
+ if hasattr(saxparser, '_ns_stack'):
+ # work around bug in built-in SAX parser (doesn't recognize xml: namespace)
+ # PyXML doesn't have this problem, and it doesn't have _ns_stack either
+ saxparser._ns_stack.append({'http://www.w3.org/XML/1998/namespace':'xml'})
try:
saxparser.parse(source)
- except xml.sax.SAXException as e:
+ except Exception, e:
+ if _debug:
+ import traceback
+ traceback.print_stack()
+ traceback.print_exc()
+ sys.stderr.write('xml parsing failed\n')
result['bozo'] = 1
result['bozo_exception'] = feedparser.exc or e
use_strict_parser = 0
- if not use_strict_parser and _SGML_AVAILABLE:
- feedparser = _LooseFeedParser(baseuri, baselang, 'utf-8', entities)
- feedparser.feed(data.decode('utf-8', 'replace'))
+ if not use_strict_parser:
+ feedparser = _LooseFeedParser(baseuri, baselang, known_encoding and 'utf-8' or '')
+ feedparser.feed(data)
result['feed'] = feedparser.feeddata
result['entries'] = feedparser.entries
result['version'] = result['version'] or feedparser.version
result['namespaces'] = feedparser.namespacesInUse
return result
+
+if __name__ == '__main__':
+ if not sys.argv[1:]:
+ print __doc__
+ sys.exit(0)
+ else:
+ urls = sys.argv[1:]
+ zopeCompatibilityHack()
+ from pprint import pprint
+ for url in urls:
+ print url
+ print
+ result = parse(url)
+ pprint(result)
+ print
+
+#REVISION HISTORY
+#1.0 - 9/27/2002 - MAP - fixed namespace processing on prefixed RSS 2.0 elements,
+# added Simon Fell's test suite
+#1.1 - 9/29/2002 - MAP - fixed infinite loop on incomplete CDATA sections
+#2.0 - 10/19/2002
+# JD - use inchannel to watch out for image and textinput elements which can
+# also contain title, link, and description elements
+# JD - check for isPermaLink='false' attribute on guid elements
+# JD - replaced openAnything with open_resource supporting ETag and
+# If-Modified-Since request headers
+# JD - parse now accepts etag, modified, agent, and referrer optional
+# arguments
+# JD - modified parse to return a dictionary instead of a tuple so that any
+# etag or modified information can be returned and cached by the caller
+#2.0.1 - 10/21/2002 - MAP - changed parse() so that if we don't get anything
+# because of etag/modified, return the old etag/modified to the caller to
+# indicate why nothing is being returned
+#2.0.2 - 10/21/2002 - JB - added the inchannel to the if statement, otherwise its
+# useless. Fixes the problem JD was addressing by adding it.
+#2.1 - 11/14/2002 - MAP - added gzip support
+#2.2 - 1/27/2003 - MAP - added attribute support, admin:generatorAgent.
+# start_admingeneratoragent is an example of how to handle elements with
+# only attributes, no content.
+#2.3 - 6/11/2003 - MAP - added USER_AGENT for default (if caller doesn't specify);
+# also, make sure we send the User-Agent even if urllib2 isn't available.
+# Match any variation of backend.userland.com/rss namespace.
+#2.3.1 - 6/12/2003 - MAP - if item has both link and guid, return both as-is.
+#2.4 - 7/9/2003 - MAP - added preliminary Pie/Atom/Echo support based on Sam Ruby's
+# snapshot of July 1 ; changed
+# project name
+#2.5 - 7/25/2003 - MAP - changed to Python license (all contributors agree);
+# removed unnecessary urllib code -- urllib2 should always be available anyway;
+# return actual url, status, and full HTTP headers (as result['url'],
+# result['status'], and result['headers']) if parsing a remote feed over HTTP --
+# this should pass all the HTTP tests at ;
+# added the latest namespace-of-the-week for RSS 2.0
+#2.5.1 - 7/26/2003 - RMK - clear opener.addheaders so we only send our custom
+# User-Agent (otherwise urllib2 sends two, which confuses some servers)
+#2.5.2 - 7/28/2003 - MAP - entity-decode inline xml properly; added support for
+# inline and as used in some RSS 2.0 feeds
+#2.5.3 - 8/6/2003 - TvdV - patch to track whether we're inside an image or
+# textInput, and also to return the character encoding (if specified)
+#2.6 - 1/1/2004 - MAP - dc:author support (MarekK); fixed bug tracking
+# nested divs within content (JohnD); fixed missing sys import (JohanS);
+# fixed regular expression to capture XML character encoding (Andrei);
+# added support for Atom 0.3-style links; fixed bug with textInput tracking;
+# added support for cloud (MartijnP); added support for multiple
+# category/dc:subject (MartijnP); normalize content model: 'description' gets
+# description (which can come from description, summary, or full content if no
+# description), 'content' gets dict of base/language/type/value (which can come
+# from content:encoded, xhtml:body, content, or fullitem);
+# fixed bug matching arbitrary Userland namespaces; added xml:base and xml:lang
+# tracking; fixed bug tracking unknown tags; fixed bug tracking content when
+# element is not in default namespace (like Pocketsoap feed);
+# resolve relative URLs in link, guid, docs, url, comments, wfw:comment,
+# wfw:commentRSS; resolve relative URLs within embedded HTML markup in
+# description, xhtml:body, content, content:encoded, title, subtitle,
+# summary, info, tagline, and copyright; added support for pingback and
+# trackback namespaces
+#2.7 - 1/5/2004 - MAP - really added support for trackback and pingback
+# namespaces, as opposed to 2.6 when I said I did but didn't really;
+# sanitize HTML markup within some elements; added mxTidy support (if
+# installed) to tidy HTML markup within some elements; fixed indentation
+# bug in _parse_date (FazalM); use socket.setdefaulttimeout if available
+# (FazalM); universal date parsing and normalization (FazalM): 'created', modified',
+# 'issued' are parsed into 9-tuple date format and stored in 'created_parsed',
+# 'modified_parsed', and 'issued_parsed'; 'date' is duplicated in 'modified'
+# and vice-versa; 'date_parsed' is duplicated in 'modified_parsed' and vice-versa
+#2.7.1 - 1/9/2004 - MAP - fixed bug handling " and '. fixed memory
+# leak not closing url opener (JohnD); added dc:publisher support (MarekK);
+# added admin:errorReportsTo support (MarekK); Python 2.1 dict support (MarekK)
+#2.7.4 - 1/14/2004 - MAP - added workaround for improperly formed
tags in
+# encoded HTML (skadz); fixed unicode handling in normalize_attrs (ChrisL);
+# fixed relative URI processing for guid (skadz); added ICBM support; added
+# base64 support
+#2.7.5 - 1/15/2004 - MAP - added workaround for malformed DOCTYPE (seen on many
+# blogspot.com sites); added _debug variable
+#2.7.6 - 1/16/2004 - MAP - fixed bug with StringIO importing
+#3.0b3 - 1/23/2004 - MAP - parse entire feed with real XML parser (if available);
+# added several new supported namespaces; fixed bug tracking naked markup in
+# description; added support for enclosure; added support for source; re-added
+# support for cloud which got dropped somehow; added support for expirationDate
+#3.0b4 - 1/26/2004 - MAP - fixed xml:lang inheritance; fixed multiple bugs tracking
+# xml:base URI, one for documents that don't define one explicitly and one for
+# documents that define an outer and an inner xml:base that goes out of scope
+# before the end of the document
+#3.0b5 - 1/26/2004 - MAP - fixed bug parsing multiple links at feed level
+#3.0b6 - 1/27/2004 - MAP - added feed type and version detection, result['version']
+# will be one of SUPPORTED_VERSIONS.keys() or empty string if unrecognized;
+# added support for creativeCommons:license and cc:license; added support for
+# full Atom content model in title, tagline, info, copyright, summary; fixed bug
+# with gzip encoding (not always telling server we support it when we do)
+#3.0b7 - 1/28/2004 - MAP - support Atom-style author element in author_detail
+# (dictionary of 'name', 'url', 'email'); map author to author_detail if author
+# contains name + email address
+#3.0b8 - 1/28/2004 - MAP - added support for contributor
+#3.0b9 - 1/29/2004 - MAP - fixed check for presence of dict function; added
+# support for summary
+#3.0b10 - 1/31/2004 - MAP - incorporated ISO-8601 date parsing routines from
+# xml.util.iso8601
+#3.0b11 - 2/2/2004 - MAP - added 'rights' to list of elements that can contain
+# dangerous markup; fiddled with decodeEntities (not right); liberalized
+# date parsing even further
+#3.0b12 - 2/6/2004 - MAP - fiddled with decodeEntities (still not right);
+# added support to Atom 0.2 subtitle; added support for Atom content model
+# in copyright; better sanitizing of dangerous HTML elements with end tags
+# (script, frameset)
+#3.0b13 - 2/8/2004 - MAP - better handling of empty HTML tags (br, hr, img,
+# etc.) in embedded markup, in either HTML or XHTML form (
,
,
)
+#3.0b14 - 2/8/2004 - MAP - fixed CDATA handling in non-wellformed feeds under
+# Python 2.1
+#3.0b15 - 2/11/2004 - MAP - fixed bug resolving relative links in wfw:commentRSS;
+# fixed bug capturing author and contributor URL; fixed bug resolving relative
+# links in author and contributor URL; fixed bug resolvin relative links in
+# generator URL; added support for recognizing RSS 1.0; passed Simon Fell's
+# namespace tests, and included them permanently in the test suite with his
+# permission; fixed namespace handling under Python 2.1
+#3.0b16 - 2/12/2004 - MAP - fixed support for RSS 0.90 (broken in b15)
+#3.0b17 - 2/13/2004 - MAP - determine character encoding as per RFC 3023
+#3.0b18 - 2/17/2004 - MAP - always map description to summary_detail (Andrei);
+# use libxml2 (if available)
+#3.0b19 - 3/15/2004 - MAP - fixed bug exploding author information when author
+# name was in parentheses; removed ultra-problematic mxTidy support; patch to
+# workaround crash in PyXML/expat when encountering invalid entities
+# (MarkMoraes); support for textinput/textInput
+#3.0b20 - 4/7/2004 - MAP - added CDF support
+#3.0b21 - 4/14/2004 - MAP - added Hot RSS support
+#3.0b22 - 4/19/2004 - MAP - changed 'channel' to 'feed', 'item' to 'entries' in
+# results dict; changed results dict to allow getting values with results.key
+# as well as results[key]; work around embedded illformed HTML with half
+# a DOCTYPE; work around malformed Content-Type header; if character encoding
+# is wrong, try several common ones before falling back to regexes (if this
+# works, bozo_exception is set to CharacterEncodingOverride); fixed character
+# encoding issues in BaseHTMLProcessor by tracking encoding and converting
+# from Unicode to raw strings before feeding data to sgmllib.SGMLParser;
+# convert each value in results to Unicode (if possible), even if using
+# regex-based parsing
+#3.0b23 - 4/21/2004 - MAP - fixed UnicodeDecodeError for feeds that contain
+# high-bit characters in attributes in embedded HTML in description (thanks
+# Thijs van de Vossen); moved guid, date, and date_parsed to mapped keys in
+# FeedParserDict; tweaked FeedParserDict.has_key to return True if asking
+# about a mapped key
+#3.0fc1 - 4/23/2004 - MAP - made results.entries[0].links[0] and
+# results.entries[0].enclosures[0] into FeedParserDict; fixed typo that could
+# cause the same encoding to be tried twice (even if it failed the first time);
+# fixed DOCTYPE stripping when DOCTYPE contained entity declarations;
+# better textinput and image tracking in illformed RSS 1.0 feeds
+#3.0fc2 - 5/10/2004 - MAP - added and passed Sam's amp tests; added and passed
+# my blink tag tests
+#3.0fc3 - 6/18/2004 - MAP - fixed bug in _changeEncodingDeclaration that
+# failed to parse utf-16 encoded feeds; made source into a FeedParserDict;
+# duplicate admin:generatorAgent/@rdf:resource in generator_detail.url;
+# added support for image; refactored parse() fallback logic to try other
+# encodings if SAX parsing fails (previously it would only try other encodings
+# if re-encoding failed); remove unichr madness in normalize_attrs now that
+# we're properly tracking encoding in and out of BaseHTMLProcessor; set
+# feed.language from root-level xml:lang; set entry.id from rdf:about;
+# send Accept header
+#3.0 - 6/21/2004 - MAP - don't try iso-8859-1 (can't distinguish between
+# iso-8859-1 and windows-1252 anyway, and most incorrectly marked feeds are
+# windows-1252); fixed regression that could cause the same encoding to be
+# tried twice (even if it failed the first time)
+#3.0.1 - 6/22/2004 - MAP - default to us-ascii for all text/* content types;
+# recover from malformed content-type header parameter with no equals sign
+# ('text/xml; charset:iso-8859-1')
+#3.1 - 6/28/2004 - MAP - added and passed tests for converting HTML entities
+# to Unicode equivalents in illformed feeds (aaronsw); added and
+# passed tests for converting character entities to Unicode equivalents
+# in illformed feeds (aaronsw); test for valid parsers when setting
+# XML_AVAILABLE; make version and encoding available when server returns
+# a 304; add handlers parameter to pass arbitrary urllib2 handlers (like
+# digest auth or proxy support); add code to parse username/password
+# out of url and send as basic authentication; expose downloading-related
+# exceptions in bozo_exception (aaronsw); added __contains__ method to
+# FeedParserDict (aaronsw); added publisher_detail (aaronsw)
+#3.2 - 7/3/2004 - MAP - use cjkcodecs and iconv_codec if available; always
+# convert feed to UTF-8 before passing to XML parser; completely revamped
+# logic for determining character encoding and attempting XML parsing
+# (much faster); increased default timeout to 20 seconds; test for presence
+# of Location header on redirects; added tests for many alternate character
+# encodings; support various EBCDIC encodings; support UTF-16BE and
+# UTF16-LE with or without a BOM; support UTF-8 with a BOM; support
+# UTF-32BE and UTF-32LE with or without a BOM; fixed crashing bug if no
+# XML parsers are available; added support for 'Content-encoding: deflate';
+# send blank 'Accept-encoding: ' header if neither gzip nor zlib modules
+# are available
+#3.3 - 7/15/2004 - MAP - optimize EBCDIC to ASCII conversion; fix obscure
+# problem tracking xml:base and xml:lang if element declares it, child
+# doesn't, first grandchild redeclares it, and second grandchild doesn't;
+# refactored date parsing; defined public registerDateHandler so callers
+# can add support for additional date formats at runtime; added support
+# for OnBlog, Nate, MSSQL, Greek, and Hungarian dates (ytrewq1); added
+# zopeCompatibilityHack() which turns FeedParserDict into a regular
+# dictionary, required for Zope compatibility, and also makes command-
+# line debugging easier because pprint module formats real dictionaries
+# better than dictionary-like objects; added NonXMLContentType exception,
+# which is stored in bozo_exception when a feed is served with a non-XML
+# media type such as 'text/plain'; respect Content-Language as default
+# language if not xml:lang is present; cloud dict is now FeedParserDict;
+# generator dict is now FeedParserDict; better tracking of xml:lang,
+# including support for xml:lang='' to unset the current language;
+# recognize RSS 1.0 feeds even when RSS 1.0 namespace is not the default
+# namespace; don't overwrite final status on redirects (scenarios:
+# redirecting to a URL that returns 304, redirecting to a URL that
+# redirects to another URL with a different type of redirect); add
+# support for HTTP 303 redirects
+#4.0 - MAP - support for relative URIs in xml:base attribute; fixed
+# encoding issue with mxTidy (phopkins); preliminary support for RFC 3229;
+# support for Atom 1.0; support for iTunes extensions; new 'tags' for
+# categories/keywords/etc. as array of dict
+# {'term': term, 'scheme': scheme, 'label': label} to match Atom 1.0
+# terminology; parse RFC 822-style dates with no time; lots of other
+# bug fixes
+#4.1 - MAP - removed socket timeout; added support for chardet library
diff --git a/libs/magic.py b/libs/magic.py
index 10685ac..6c30a0c 100644
--- a/libs/magic.py
+++ b/libs/magic.py
@@ -1,4 +1,7 @@
"""
+Adam Hupp (adam@hupp.org)
+http://github.com/ahupp/python-magic
+
magic is a wrapper around the libmagic file identification library.
See README for more information.
@@ -17,12 +20,9 @@ Usage:
"""
-import sys
-import glob
import os.path
import ctypes
import ctypes.util
-import threading
from ctypes import c_char_p, c_int, c_size_t, c_void_p
@@ -34,112 +34,74 @@ class Magic:
"""
- def __init__(self, mime=False, magic_file=None, mime_encoding=False,
- keep_going=False):
+ def __init__(self, mime=False, magic_file=None, mime_encoding=False):
"""
Create a new libmagic wrapper.
mime - if True, mimetypes are returned instead of textual descriptions
mime_encoding - if True, codec is returned
magic_file - use a mime database other than the system default
- keep_going - don't stop at the first match, keep going
- """
- self.flags = MAGIC_NONE
- if mime:
- self.flags |= MAGIC_MIME
- elif mime_encoding:
- self.flags |= MAGIC_MIME_ENCODING
- if keep_going:
- self.flags |= MAGIC_CONTINUE
- self.cookie = magic_open(self.flags)
+ """
+ flags = MAGIC_NONE
+ if mime:
+ flags |= MAGIC_MIME
+ elif mime_encoding:
+ flags |= MAGIC_MIME_ENCODING
+
+ self.cookie = magic_open(flags)
magic_load(self.cookie, magic_file)
- self.thread = threading.currentThread()
def from_buffer(self, buf):
"""
Identify the contents of `buf`
"""
- self._thread_check()
- try:
- return magic_buffer(self.cookie, buf)
- except MagicException as e:
- return self._handle509Bug(e)
+ return magic_buffer(self.cookie, buf)
def from_file(self, filename):
"""
Identify the contents of file `filename`
raises IOError if the file does not exist
"""
- self._thread_check()
+
if not os.path.exists(filename):
raise IOError("File does not exist: " + filename)
- try:
- return magic_file(self.cookie, filename)
- except MagicException as e:
- return self._handle509Bug(e)
- def _handle509Bug(self, e):
- # libmagic 5.09 has a bug where it might mail to identify the
- # mimetype of a file and returns null from magic_file (and
- # likely _buffer), but also does not return an error message.
- if e.message is None and (self.flags & MAGIC_MIME):
- return "application/octet-stream"
-
- def _thread_check(self):
- if self.thread != threading.currentThread():
- raise Exception('attempting to use libmagic on multiple threads will '
- 'end in SEGV. Prefer to use the module functions '
- 'from_file or from_buffer, or carefully manage direct '
- 'use of the Magic class')
+ return magic_file(self.cookie, filename)
def __del__(self):
- # no _thread_check here because there can be no other
- # references to this object at this point.
-
- # during shutdown magic_close may have been cleared already so
- # make sure it exists before using it.
-
- # the self.cookie check should be unnessary and was an
- # incorrect fix for a threading problem, however I'm leaving
- # it in because it's harmless and I'm slightly afraid to
- # remove it.
- if self.cookie and magic_close:
+ if self.cookie:
magic_close(self.cookie)
self.cookie = None
+_magic_mime = None
+_magic = None
-instances = threading.local()
+def _get_magic_mime():
+ global _magic_mime
+ if not _magic_mime:
+ _magic_mime = Magic(mime=True)
+ return _magic_mime
+
+def _get_magic():
+ global _magic
+ if not _magic:
+ _magic = Magic()
+ return _magic
def _get_magic_type(mime):
- i = instances.__dict__.get(mime)
- if i is None:
- i = instances.__dict__[mime] = Magic(mime=mime)
- return i
+ if mime:
+ return _get_magic_mime()
+ else:
+ return _get_magic()
def from_file(filename, mime=False):
- """"
- Accepts a filename and returns the detected filetype. Return
- value is the mimetype if mime=True, otherwise a human readable
- name.
-
- >>> magic.from_file("testdata/test.pdf", mime=True)
- 'application/pdf'
- """
m = _get_magic_type(mime)
return m.from_file(filename)
def from_buffer(buffer, mime=False):
- """
- Accepts a binary string and returns the detected filetype. Return
- value is the mimetype if mime=True, otherwise a human readable
- name.
-
- >>> magic.from_buffer(open("testdata/test.pdf").read(1024))
- 'PDF document, version 1.2'
- """
m = _get_magic_type(mime)
return m.from_buffer(buffer)
@@ -148,22 +110,19 @@ def from_buffer(buffer, mime=False):
libmagic = None
# Let's try to find magic or magic1
-dll = ctypes.util.find_library('magic') or ctypes.util.find_library('magic1') or ctypes.util.find_library('cygmagic-1')
+dll = ctypes.util.find_library('magic') or ctypes.util.find_library('magic1')
# This is necessary because find_library returns None if it doesn't find the library
if dll:
libmagic = ctypes.CDLL(dll)
if not libmagic or not libmagic._name:
- platform_to_lib = {'darwin': ['/opt/local/lib/libmagic.dylib',
- '/usr/local/lib/libmagic.dylib'] +
- # Assumes there will only be one version installed
- glob.glob('/usr/local/Cellar/libmagic/*/lib/libmagic.dylib'),
- 'win32': ['magic1.dll','cygmagic-1.dll']}
- for dll in platform_to_lib.get(sys.platform, []):
+ import sys
+ platform_to_lib = {'darwin': '/opt/local/lib/libmagic.dylib',
+ 'win32': 'magic1.dll'}
+ if sys.platform in platform_to_lib:
try:
- libmagic = ctypes.CDLL(dll)
- break
+ libmagic = ctypes.CDLL(platform_to_lib[sys.platform])
except OSError:
pass
@@ -173,38 +132,13 @@ if not libmagic or not libmagic._name:
magic_t = ctypes.c_void_p
-def errorcheck_null(result, func, args):
- if result is None:
- err = magic_error(args[0])
+def errorcheck(result, func, args):
+ err = magic_error(args[0])
+ if err is not None:
raise MagicException(err)
else:
return result
-def errorcheck_negative_one(result, func, args):
- if result is -1:
- err = magic_error(args[0])
- raise MagicException(err)
- else:
- return result
-
-
-def coerce_filename(filename):
- if filename is None:
- return None
-
- # ctypes will implicitly convert unicode strings to bytes with
- # .encode('ascii'). If you use the filesystem encoding
- # then you'll get inconsistent behavior (crashes) depending on the user's
- # LANG environment variable
- is_unicode = (sys.version_info[0] <= 2 and
- isinstance(filename, unicode)) or \
- (sys.version_info[0] >= 3 and
- isinstance(filename, str))
- if is_unicode:
- return filename.encode('utf-8')
- else:
- return filename
-
magic_open = libmagic.magic_open
magic_open.restype = magic_t
magic_open.argtypes = [c_int]
@@ -221,30 +155,26 @@ magic_errno = libmagic.magic_errno
magic_errno.restype = c_int
magic_errno.argtypes = [magic_t]
-_magic_file = libmagic.magic_file
-_magic_file.restype = c_char_p
-_magic_file.argtypes = [magic_t, c_char_p]
-_magic_file.errcheck = errorcheck_null
+magic_file = libmagic.magic_file
+magic_file.restype = c_char_p
+magic_file.argtypes = [magic_t, c_char_p]
+magic_file.errcheck = errorcheck
-def magic_file(cookie, filename):
- return _magic_file(cookie, coerce_filename(filename))
_magic_buffer = libmagic.magic_buffer
_magic_buffer.restype = c_char_p
_magic_buffer.argtypes = [magic_t, c_void_p, c_size_t]
-_magic_buffer.errcheck = errorcheck_null
+_magic_buffer.errcheck = errorcheck
+
def magic_buffer(cookie, buf):
return _magic_buffer(cookie, buf, len(buf))
-_magic_load = libmagic.magic_load
-_magic_load.restype = c_int
-_magic_load.argtypes = [magic_t, c_char_p]
-_magic_load.errcheck = errorcheck_negative_one
-
-def magic_load(cookie, filename):
- return _magic_load(cookie, coerce_filename(filename))
+magic_load = libmagic.magic_load
+magic_load.restype = c_int
+magic_load.argtypes = [magic_t, c_char_p]
+magic_load.errcheck = errorcheck
magic_setflags = libmagic.magic_setflags
magic_setflags.restype = c_int
diff --git a/libs/pytwmn.py b/libs/pytwmn.py
index 6b2d774..49661fb 100644
--- a/libs/pytwmn.py
+++ b/libs/pytwmn.py
@@ -45,8 +45,8 @@ def init(host="127.0.0.1", port=None):
class Notification(object):
def __init__(self, title="", msg="", icon=""):
- self.title = str(title)
- self.msg = str(msg)
+ self.title = unicode(title)
+ self.msg = unicode(msg)
if icon.startswith("file://"):
icon = icon[7:]
self.icon = icon
diff --git a/libs/sgmllib.py b/libs/sgmllib.py
deleted file mode 100644
index 88a02a3..0000000
--- a/libs/sgmllib.py
+++ /dev/null
@@ -1,547 +0,0 @@
-"""A parser for SGML, using the derived class as a static DTD."""
-
-# XXX This only supports those SGML features used by HTML.
-
-# XXX There should be a way to distinguish between PCDATA (parsed
-# character data -- the normal case), RCDATA (replaceable character
-# data -- only char and entity references and end tags are special)
-# and CDATA (character data -- only end tags are special). RCDATA is
-# not supported at all.
-
-import _markupbase
-import re
-
-__all__ = ["SGMLParser", "SGMLParseError"]
-
-# Regular expressions used for parsing
-
-interesting = re.compile('[&<]')
-incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
- '<([a-zA-Z][^<>]*|'
- '/([a-zA-Z][^<>]*)?|'
- '![^<>]*)?')
-
-entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
-charref = re.compile('([0-9]+)[^0-9]')
-
-starttagopen = re.compile('<[>a-zA-Z]')
-shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
-shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
-piclose = re.compile('>')
-endbracket = re.compile('[<>]')
-tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
-attrfind = re.compile(
- r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
- r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
-
-
-class SGMLParseError(RuntimeError):
- """Exception raised for all parse errors."""
- pass
-
-
-# SGML parser base class -- find tags and call handler functions.
-# Usage: p = SGMLParser(); p.feed(data); ...; p.close().
-# The dtd is defined by deriving a class which defines methods
-# with special names to handle tags: start_foo and end_foo to handle
-# and , respectively, or do_foo to handle by itself.
-# (Tags are converted to lower case for this purpose.) The data
-# between tags is passed to the parser by calling self.handle_data()
-# with some data as argument (the data may be split up in arbitrary
-# chunks). Entity references are passed by calling
-# self.handle_entityref() with the entity reference as argument.
-
-class SGMLParser(_markupbase.ParserBase):
- # Definition of entities -- derived classes may override
- entity_or_charref = re.compile('&(?:'
- '([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)'
- ')(;?)')
-
- def __init__(self, verbose=0):
- """Initialize and reset this instance."""
- self.verbose = verbose
- self.reset()
-
- def reset(self):
- """Reset this instance. Loses all unprocessed data."""
- self.__starttag_text = None
- self.rawdata = ''
- self.stack = []
- self.lasttag = '???'
- self.nomoretags = 0
- self.literal = 0
- _markupbase.ParserBase.reset(self)
-
- def setnomoretags(self):
- """Enter literal mode (CDATA) till EOF.
-
- Intended for derived classes only.
- """
- self.nomoretags = self.literal = 1
-
- def setliteral(self, *args):
- """Enter literal mode (CDATA).
-
- Intended for derived classes only.
- """
- self.literal = 1
-
- def feed(self, data):
- """Feed some data to the parser.
-
- Call this as often as you want, with as little or as much text
- as you want (may include '\n'). (This just saves the text,
- all the processing is done by goahead().)
- """
-
- self.rawdata = self.rawdata + data
- self.goahead(0)
-
- def close(self):
- """Handle the remaining data."""
- self.goahead(1)
-
- def error(self, message):
- raise SGMLParseError(message)
-
- # Internal -- handle data as far as reasonable. May leave state
- # and data to be processed by a subsequent call. If 'end' is
- # true, force handling all data as if followed by EOF marker.
- def goahead(self, end):
- rawdata = self.rawdata
- i = 0
- n = len(rawdata)
- while i < n:
- if self.nomoretags:
- self.handle_data(rawdata[i:n])
- i = n
- break
- match = interesting.search(rawdata, i)
- if match: j = match.start()
- else: j = n
- if i < j:
- self.handle_data(rawdata[i:j])
- i = j
- if i == n: break
- if rawdata[i] == '<':
- if starttagopen.match(rawdata, i):
- if self.literal:
- self.handle_data(rawdata[i])
- i = i+1
- continue
- k = self.parse_starttag(i)
- if k < 0: break
- i = k
- continue
- if rawdata.startswith("", i):
- k = self.parse_endtag(i)
- if k < 0: break
- i = k
- self.literal = 0
- continue
- if self.literal:
- if n > (i + 1):
- self.handle_data("<")
- i = i+1
- else:
- # incomplete
- break
- continue
- if rawdata.startswith(" send "%s"' % msg)
try:
self.socket.send(msg + bytes("\r\n", "ascii"))
- except socket.error as se:
+ except socket.error, se:
try: # a little dance of compatibility to get the errno
errno = se.errno
except AttributeError:
@@ -161,12 +160,12 @@ class IRCClient:
while not self._end:
try:
buffer += self.socket.recv(1024)
- except socket.timeout as e:
+ except socket.timeout, e:
if self._end:
break
logging.debug("timeout in client.py")
raise e
- except socket.error as e:
+ except socket.error, e:
if self._end:
break
logging.debug("error %s" % e)
@@ -196,16 +195,16 @@ class IRCClient:
pass
yield True
- except socket.timeout as se:
+ except socket.timeout, se:
logging.debug("passing timeout")
raise se
- except socket.error as se:
+ except socket.error, se:
logging.debug("problem: %s" % (se))
if self.socket:
logging.info('error: closing socket')
self.socket.close()
raise se
- except Exception as e:
+ except Exception, e:
logging.debug("other exception: %s" % e)
raise e
else:
@@ -254,7 +253,7 @@ class IRCApp:
garuntee the callback will be called after seconds has passed.
( the only advantage to these timers is they dont use threads )
"""
- assert isinstance(cb, collections.Callable)
+ assert callable(cb)
logging.info('added timer to call %s in %ss' % (cb, seconds))
self._timers.append((time.time() + seconds, cb))
@@ -265,13 +264,13 @@ class IRCApp:
while self.running:
found_one_alive = False
- for client, clientdesc in self._clients.items():
+ for client, clientdesc in self._clients.iteritems():
if clientdesc.con is None:
clientdesc.con = client.connect()
try:
- next(clientdesc.con)
- except Exception as e:
+ clientdesc.con.next()
+ except Exception, e:
logging.error('client error %s' % e)
logging.error(traceback.format_exc())
if clientdesc.autoreconnect:
diff --git a/oyoyo/cmdhandler.py b/oyoyo/cmdhandler.py
index a7a8a86..778020e 100644
--- a/oyoyo/cmdhandler.py
+++ b/oyoyo/cmdhandler.py
@@ -65,17 +65,13 @@ class CommandHandler(object):
its possible to pass both "command.sub.func" and
["command", "sub", "func"].
"""
- if isinstance(in_command_parts, (bytes)):
+ if isinstance(in_command_parts, (str, bytes)):
in_command_parts = in_command_parts.split(bytes('.', 'ascii'))
- elif isinstance(in_command_parts, (str)):
- in_command_parts = in_command_parts.split('.')
command_parts = in_command_parts[:]
p = self
while command_parts:
- cmd = command_parts.pop(0)
- if type(cmd) is bytes:
- cmd = cmd.decode('utf-8')
+ cmd = command_parts.pop(0).decode('ascii')
if cmd.startswith('_'):
raise ProtectedCommandError(in_command_parts)
@@ -109,7 +105,7 @@ class CommandHandler(object):
try:
f(*args)
- except Exception as e:
+ except Exception, e:
logging.error('command raised %s' % e)
logging.error(traceback.format_exc())
raise CommandError(command)
@@ -155,7 +151,7 @@ class DefaultBotCommandHandler(CommandHandler):
else:
try:
f = self.get(arg)
- except CommandError as e:
+ except CommandError, e:
helpers.msg(self.client, dest, str(e))
return
@@ -202,7 +198,7 @@ class BotCommandHandler(DefaultCommandHandler):
try:
self.command_handler.run(command, prefix, dest, *arg)
- except CommandError as e:
+ except CommandError, e:
helpers.msg(self.client, dest, str(e))
return True
diff --git a/oyoyo/examplebot.py b/oyoyo/examplebot.py
index dfd1885..81aac02 100644
--- a/oyoyo/examplebot.py
+++ b/oyoyo/examplebot.py
@@ -21,7 +21,7 @@ class MyHandler(DefaultCommandHandler):
match = re.match('\!say (.*)', msg)
if match:
to_say = match.group(1).strip()
- print(('Saying, "%s"' % to_say))
+ print('Saying, "%s"' % to_say)
helpers.msg(self.client, chan, to_say)
@@ -37,7 +37,7 @@ def main():
conn = cli.connect()
while True:
- next(conn) ## python 2
+ conn.next() ## python 2
# next(conn) ## python 3
diff --git a/oyoyo/helpers.py b/oyoyo/helpers.py
index 5c25b59..c82ec9c 100644
--- a/oyoyo/helpers.py
+++ b/oyoyo/helpers.py
@@ -111,7 +111,7 @@ def _addNumerics():
cli.send(cmd_num, *args)
return f
m = sys.modules[__name__]
- for num, name in ircevents.numeric_events.items():
+ for num, name in ircevents.numeric_events.iteritems():
setattr(m, name, numericcmd(num, name))
_addNumerics()
diff --git a/oyoyo/ircevents.py b/oyoyo/ircevents.py
index a1bda3c..6d8969b 100644
--- a/oyoyo/ircevents.py
+++ b/oyoyo/ircevents.py
@@ -179,8 +179,6 @@ numeric_events = {
"502": "usersdontmatch",
}
-numeric_events = {bytes(k, 'ascii'):v for k, v in numeric_events.items()}
-
generated_events = [
# Generated events
"dcc_connect",
@@ -208,5 +206,5 @@ protocol_events = [
"pong",
]
-all_events = generated_events + protocol_events + list(numeric_events.values())
+all_events = generated_events + protocol_events + numeric_events.values()
diff --git a/oyoyo/services.py b/oyoyo/services.py
index 751a787..9183beb 100644
--- a/oyoyo/services.py
+++ b/oyoyo/services.py
@@ -1,5 +1,5 @@
import sys
-from .helpers import msg
+from helpers import msg
# NickServ basic functions
_nickservfuncs = (
@@ -103,7 +103,7 @@ def _addServ(serv, funcs, prefix=""):
if prefix:
cmd_name = prefix.upper() + " " + cmd_name
def f(cli, *args):
- print(cmd_name, " ".join(args))
+ print cmd_name, " ".join(args)
#cli.send(cmd_name, serv.name, *args)
return f
for t in funcs:
diff --git a/parsetools.py b/parsetools.py
index c834020..4abceed 100644
--- a/parsetools.py
+++ b/parsetools.py
@@ -29,7 +29,7 @@ quirkloader = ScriptQuirks()
quirkloader.add(PythonQuirks())
quirkloader.add(LuaQuirks())
quirkloader.loadAll()
-print(quirkloader.funcre())
+print quirkloader.funcre()
_functionre = re.compile(r"%s" % quirkloader.funcre())
_groupre = re.compile(r"\\([0-9]+)")
@@ -44,7 +44,7 @@ def lexer(string, objlist):
for (oType, regexp) in objlist:
newstringlist = []
for (stri, s) in enumerate(stringlist):
- if type(s) not in [str]:
+ if type(s) not in [str, unicode]:
newstringlist.append(s)
continue
lasti = 0
@@ -207,9 +207,9 @@ def lexMessage(string):
(smiley, _smilere),
(honker, _honk)]
- string = str(string)
+ string = unicode(string)
string = string.replace("\n", " ").replace("\r", " ")
- lexed = lexer(str(string), lexlist)
+ lexed = lexer(unicode(string), lexlist)
balanced = []
beginc = 0
@@ -231,7 +231,7 @@ def lexMessage(string):
balanced.append(colorEnd(""))
if len(balanced) == 0:
balanced.append("")
- if type(balanced[len(balanced)-1]) not in [str]:
+ if type(balanced[len(balanced)-1]) not in [str, unicode]:
balanced.append("")
return balanced
@@ -239,12 +239,12 @@ def convertTags(lexed, format="html"):
if format not in ["html", "bbcode", "ctag", "text"]:
raise ValueError("Color format not recognized")
- if type(lexed) in [str]:
+ if type(lexed) in [str, unicode]:
lexed = lexMessage(lexed)
escaped = ""
firststr = True
for (i, o) in enumerate(lexed):
- if type(o) in [str]:
+ if type(o) in [str, unicode]:
if format == "html":
                escaped += o.replace("&", "&amp;").replace(">", "&gt;").replace("<","&lt;")
else:
@@ -259,7 +259,7 @@ def splitMessage(msg, format="ctag"):
# split long text lines
buf = []
for o in msg:
- if type(o) in [str] and len(o) > 200:
+ if type(o) in [str, unicode] and len(o) > 200:
for i in range(0, len(o), 200):
buf.append(o[i:i+200])
else:
@@ -401,7 +401,7 @@ def parseRegexpFunctions(to):
backr = _groupre.search(mo.group())
if backr is not None:
current.append(backreference(backr.group(1)))
- elif mo.group()[:-1] in list(functiondict.keys()):
+ elif mo.group()[:-1] in functiondict.keys():
p = parseLeaf(functiondict[mo.group()[:-1]], current)
current.append(p)
current = p
@@ -418,7 +418,7 @@ def parseRegexpFunctions(to):
def img2smiley(string):
- string = str(string)
+ string = unicode(string)
def imagerep(mo):
return reverse_smiley[mo.group(1)]
    string = re.sub(r'<img src="smilies/(\S+)" />', imagerep, string)
@@ -499,8 +499,8 @@ if ostools.isOSXBundle():
-reverse_smiley = dict((v,k) for k, v in smiledict.items())
-_smilere = re.compile("|".join(list(smiledict.keys())))
+reverse_smiley = dict((v,k) for k, v in smiledict.iteritems())
+_smilere = re.compile("|".join(smiledict.keys()))
class ThemeException(Exception):
def __init__(self, value):
diff --git a/pesterchum.py b/pesterchum.py
index 115a66f..7bff675 100644
--- a/pesterchum.py
+++ b/pesterchum.py
@@ -9,22 +9,28 @@ from datetime import *
import random
import re
from time import time
-import threading, queue
+import threading, Queue
reqmissing = []
optmissing = []
try:
- from PyQt5 import QtGui, QtCore, QtWidgets, QtMultimedia
-except ImportError as e:
+ from PyQt5 import QtGui, QtCore, QtWidgets
+except ImportError, e:
module = str(e)
if module.startswith("No module named ") or \
module.startswith("cannot import name "):
reqmissing.append(module[module.rfind(" ")+1:])
- else: print(e)
-
+ else: print e
+try:
+ import pygame
+except ImportError, e:
+ pygame = None
+ module = str(e)
+ if module[:16] == "No module named ": optmissing.append(module[16:])
+ else: print e
if reqmissing:
- print("ERROR: The following modules are required for Pesterchum to run and are missing on your system:")
- for m in reqmissing: print("* "+m)
+ print "ERROR: The following modules are required for Pesterchum to run and are missing on your system:"
+ for m in reqmissing: print "* "+m
exit()
vnum = QtCore.qVersion()
major = int(vnum[:vnum.find(".")])
@@ -33,8 +39,8 @@ if vnum.find(".", vnum.find(".")+1) != -1:
else:
minor = int(vnum[vnum.find(".")+1:])
if not ((major > 4) or (major == 4 and minor >= 6)):
- print("ERROR: Pesterchum requires Qt version >= 4.6")
- print("You currently have version " + vnum + ". Please upgrade Qt")
+ print "ERROR: Pesterchum requires Qt version >= 4.6"
+ print "You currently have version " + vnum + ". Please upgrade Qt"
exit()
import ostools
@@ -107,7 +113,7 @@ class waitingMessageHolder(object):
def __init__(self, mainwindow, **msgfuncs):
self.mainwindow = mainwindow
self.funcs = msgfuncs
- self.queue = list(msgfuncs.keys())
+ self.queue = msgfuncs.keys()
if len(self.queue) > 0:
self.mainwindow.updateSystemTray()
def waitingHandles(self):
@@ -123,7 +129,7 @@ class waitingMessageHolder(object):
if len(self.queue) == 0:
self.mainwindow.updateSystemTray()
def addMessage(self, handle, func):
- if handle not in self.funcs:
+ if not self.funcs.has_key(handle):
self.queue.append(handle)
self.funcs[handle] = func
if len(self.queue) > 0:
@@ -276,13 +282,13 @@ class chumArea(RightClickTree):
@QtCore.pyqtSlot()
def beginNotify(self):
- print("BEGIN NOTIFY")
+ print "BEGIN NOTIFY"
self.notify = True
def getOptionsMenu(self):
if not self.currentItem():
return None
- text = str(self.currentItem().text(0))
+ text = unicode(self.currentItem().text(0))
if text.rfind(" (") != -1:
text = text[0:text.rfind(" (")]
if text == "Chums":
@@ -328,13 +334,13 @@ class chumArea(RightClickTree):
if thisitem.rfind(" (") != -1:
thisitem = thisitem[0:thisitem.rfind(" (")]
# Drop item is a group
- thisitem = str(event.source().currentItem().text(0))
+ thisitem = unicode(event.source().currentItem().text(0))
if thisitem.rfind(" (") != -1:
thisitem = thisitem[0:thisitem.rfind(" (")]
if thisitem == "Chums" or thisitem in self.groups:
droppos = self.itemAt(event.pos())
if not droppos: return
- droppos = str(droppos.text(0))
+ droppos = unicode(droppos.text(0))
if droppos.rfind(" ") != -1:
droppos = droppos[0:droppos.rfind(" ")]
if droppos == "Chums" or droppos in self.groups:
@@ -347,16 +353,16 @@ class chumArea(RightClickTree):
gTemp = []
for i in range(self.topLevelItemCount()):
- text = str(self.topLevelItem(i).text(0))
+ text = unicode(self.topLevelItem(i).text(0))
if text.rfind(" (") != -1:
text = text[0:text.rfind(" (")]
- gTemp.append([str(text), self.topLevelItem(i).isExpanded()])
+ gTemp.append([unicode(text), self.topLevelItem(i).isExpanded()])
self.mainwindow.config.saveGroups(gTemp)
# Drop item is a chum
else:
item = self.itemAt(event.pos())
if item:
- text = str(item.text(0))
+ text = unicode(item.text(0))
# Figure out which group to drop into
if text.rfind(" (") != -1:
text = text[0:text.rfind(" (")]
@@ -364,7 +370,7 @@ class chumArea(RightClickTree):
group = text
gitem = item
else:
- ptext = str(item.parent().text(0))
+ ptext = unicode(item.parent().text(0))
if ptext.rfind(" ") != -1:
ptext = ptext[0:ptext.rfind(" ")]
group = ptext
@@ -387,7 +393,7 @@ class chumArea(RightClickTree):
if chums.index(thisitem) < inPos:
inPos -= 1
chums.remove(thisitem)
- chums.insert(inPos, str(thisitem))
+ chums.insert(inPos, unicode(thisitem))
self.mainwindow.config.setChums(chums)
else:
@@ -399,9 +405,9 @@ class chumArea(RightClickTree):
currentGroup = self.currentItem()
if currentGroup:
if currentGroup.parent():
- text = str(currentGroup.parent().text(0))
+ text = unicode(currentGroup.parent().text(0))
else:
- text = str(currentGroup.text(0))
+ text = unicode(currentGroup.text(0))
if text.rfind(" (") != -1:
text = text[0:text.rfind(" (")]
currentGroup = text
@@ -459,7 +465,7 @@ class chumArea(RightClickTree):
return
curgroups = []
for i in range(self.topLevelItemCount()):
- text = str(self.topLevelItem(i).text(0))
+ text = unicode(self.topLevelItem(i).text(0))
if text.rfind(" (") != -1:
text = text[0:text.rfind(" (")]
curgroups.append(text)
@@ -483,31 +489,31 @@ class chumArea(RightClickTree):
totals = {'Chums': 0}
online = {'Chums': 0}
for g in self.groups:
- totals[str(g)] = 0
- online[str(g)] = 0
+ totals[unicode(g)] = 0
+ online[unicode(g)] = 0
for c in self.chums:
yes = c.mood.name() != "offline"
if c.group == "Chums":
- totals[str(c.group)] = totals[str(c.group)]+1
+ totals[unicode(c.group)] = totals[unicode(c.group)]+1
if yes:
- online[str(c.group)] = online[str(c.group)]+1
+ online[unicode(c.group)] = online[unicode(c.group)]+1
elif c.group in totals:
- totals[str(c.group)] = totals[str(c.group)]+1
+ totals[unicode(c.group)] = totals[unicode(c.group)]+1
if yes:
- online[str(c.group)] = online[str(c.group)]+1
+ online[unicode(c.group)] = online[unicode(c.group)]+1
else:
totals["Chums"] = totals["Chums"]+1
if yes:
online["Chums"] = online["Chums"]+1
for i in range(self.topLevelItemCount()):
- text = str(self.topLevelItem(i).text(0))
+ text = unicode(self.topLevelItem(i).text(0))
if text.rfind(" (") != -1:
text = text[0:text.rfind(" (")]
if text in online:
self.topLevelItem(i).setText(0, "%s (%i/%i)" % (text, online[text], totals[text]))
def hideOnlineNumbers(self):
for i in range(self.topLevelItemCount()):
- text = str(self.topLevelItem(i).text(0))
+ text = unicode(self.topLevelItem(i).text(0))
if text.rfind(" (") != -1:
text = text[0:text.rfind(" (")]
self.topLevelItem(i).setText(0, "%s" % (text))
@@ -523,7 +529,7 @@ class chumArea(RightClickTree):
@QtCore.pyqtSlot()
def expandGroup(self):
item = self.currentItem()
- text = str(item.text(0))
+ text = unicode(item.text(0))
if text.rfind(" (") != -1:
text = text[0:text.rfind(" (")]
@@ -538,7 +544,7 @@ class chumArea(RightClickTree):
self.mainwindow.config.addGroup("Chums")
curgroups = []
for i in range(self.topLevelItemCount()):
- text = str(self.topLevelItem(i).text(0))
+ text = unicode(self.topLevelItem(i).text(0))
if text.rfind(" (") != -1:
text = text[0:text.rfind(" (")]
curgroups.append(text)
@@ -555,7 +561,7 @@ class chumArea(RightClickTree):
if self.openGroups[self.groups.index("%s" % (chumLabel.chum.group))]:
child_1.setExpanded(True)
for i in range(self.topLevelItemCount()):
- text = str(self.topLevelItem(i).text(0))
+ text = unicode(self.topLevelItem(i).text(0))
if text.rfind(" (") != -1:
text = text[0:text.rfind(" (")]
if text == chumLabel.chum.group:
@@ -574,7 +580,7 @@ class chumArea(RightClickTree):
bestname = ""
if fi > 0:
while not bestj:
- for j in range(self.topLevelItem(i).childCount()):
+ for j in xrange(self.topLevelItem(i).childCount()):
if chums[fi-c] == str(self.topLevelItem(i).child(j).text(0)):
bestj = j
bestname = chums[fi-c]
@@ -649,7 +655,7 @@ class chumArea(RightClickTree):
def initTheme(self, theme):
self.resize(*theme["main/chums/size"])
self.move(*theme["main/chums/loc"])
- if "main/chums/scrollbar" in theme:
+ if theme.has_key("main/chums/scrollbar"):
self.setStyleSheet("QListWidget { %s } QScrollBar { %s } QScrollBar::handle { %s } QScrollBar::add-line { %s } QScrollBar::sub-line { %s } QScrollBar:up-arrow { %s } QScrollBar:down-arrow { %s }" % (theme["main/chums/style"], theme["main/chums/scrollbar/style"], theme["main/chums/scrollbar/handle"], theme["main/chums/scrollbar/downarrow"], theme["main/chums/scrollbar/uparrow"], theme["main/chums/scrollbar/uarrowstyle"], theme["main/chums/scrollbar/darrowstyle"] ))
else:
self.setStyleSheet(theme["main/chums/style"])
@@ -757,7 +763,7 @@ class chumArea(RightClickTree):
return
(notes, ok) = QtWidgets.QInputDialog.getText(self, "Notes", "Enter your notes...")
if ok:
- notes = str(notes)
+ notes = unicode(notes)
self.mainwindow.chumdb.setNotes(currentChum.handle, notes)
currentChum.setToolTip(0, "%s: %s" % (currentChum.handle, notes))
@QtCore.pyqtSlot()
@@ -767,7 +773,7 @@ class chumArea(RightClickTree):
if not self.renamegroupdialog:
(gname, ok) = QtWidgets.QInputDialog.getText(self, "Rename Group", "Enter a new name for the group:")
if ok:
- gname = str(gname)
+ gname = unicode(gname)
if re.search("[^A-Za-z0-9_\s]", gname) is not None:
msgbox = QtWidgets.QMessageBox()
msgbox.setInformativeText("THIS IS NOT A VALID GROUP NAME")
@@ -781,7 +787,7 @@ class chumArea(RightClickTree):
index = self.indexOfTopLevelItem(currentGroup)
if index != -1:
expanded = currentGroup.isExpanded()
- text = str(currentGroup.text(0))
+ text = unicode(currentGroup.text(0))
if text.rfind(" (") != -1:
text = text[0:text.rfind(" (")]
self.mainwindow.config.delGroup(text)
@@ -801,7 +807,7 @@ class chumArea(RightClickTree):
currentGroup = self.currentItem()
if not currentGroup:
return
- text = str(currentGroup.text(0))
+ text = unicode(currentGroup.text(0))
if text.rfind(" (") != -1:
text = text[0:text.rfind(" (")]
self.mainwindow.config.delGroup(text)
@@ -824,7 +830,7 @@ class chumArea(RightClickTree):
def moveToGroup(self, item):
if not item:
return
- group = str(item.text())
+ group = unicode(item.text())
chumLabel = self.currentItem()
if not chumLabel:
return
@@ -939,7 +945,7 @@ class TrollSlumWindow(QtWidgets.QFrame):
self.addtrolldialog = QtWidgets.QInputDialog(self)
(handle, ok) = self.addtrolldialog.getText(self, "Add Troll", "Enter Troll Handle:")
if ok:
- handle = str(handle)
+ handle = unicode(handle)
if not (PesterProfile.checkLength(handle) and
PesterProfile.checkValid(handle)[0]):
errormsg = QtWidgets.QErrorMessage(self)
@@ -990,9 +996,8 @@ class PesterWindow(MovingWindow):
try:
themeChecker(self.theme)
- except ThemeException as xxx_todo_changeme:
- (inst) = xxx_todo_changeme
- print("Caught: "+inst.parameter)
+ except ThemeException, (inst):
+ print "Caught: "+inst.parameter
themeWarning = QtWidgets.QMessageBox(self)
themeWarning.setText("Theme Error: %s" % (inst))
themeWarning.exec_()
@@ -1152,7 +1157,7 @@ class PesterWindow(MovingWindow):
@QtCore.pyqtSlot()
def updatePC(self):
- version.updateDownload(str(self.updatemenu.url))
+ version.updateDownload(unicode(self.updatemenu.url))
self.updatemenu = None
@QtCore.pyqtSlot()
def noUpdatePC(self):
@@ -1208,7 +1213,7 @@ class PesterWindow(MovingWindow):
return
# notify
if self.config.notifyOptions() & self.config.NEWMSG:
- if handle not in self.convos:
+ if not self.convos.has_key(handle):
t = self.tm.Toast("New Conversation", "From: %s" % handle)
t.show()
elif not self.config.notifyOptions() & self.config.NEWCONVO:
@@ -1226,7 +1231,7 @@ class PesterWindow(MovingWindow):
elif msg == "PESTERCHUM:UNBLOCK":
t = self.tm.Toast("Unblocked", handle)
t.show()
- if handle not in self.convos:
+ if not self.convos.has_key(handle):
if msg == "PESTERCHUM:CEASE": # ignore cease after we hang up
return
matchingChums = [c for c in self.chumList.chums if c.handle == handle]
@@ -1248,12 +1253,12 @@ class PesterWindow(MovingWindow):
else:
self.alarm.play()
def newMemoMsg(self, chan, handle, msg):
- if chan not in self.memos:
+ if not self.memos.has_key(chan):
# silently ignore in case we forgot to /part
return
memo = self.memos[chan]
- msg = str(msg)
- if handle not in memo.times:
+ msg = unicode(msg)
+ if not memo.times.has_key(handle):
# new chum! time current
newtime = timedelta(0)
time = TimeTracker(newtime)
@@ -1291,19 +1296,19 @@ class PesterWindow(MovingWindow):
def changeColor(self, handle, color):
# pesterconvo and chumlist
self.chumList.updateColor(handle, color)
- if handle in self.convos:
+ if self.convos.has_key(handle):
self.convos[handle].updateColor(color)
self.chumdb.setColor(handle, color)
def updateMood(self, handle, mood):
# updates OTHER chums' moods
oldmood = self.chumList.updateMood(handle, mood)
- if handle in self.convos:
+ if self.convos.has_key(handle):
self.convos[handle].updateMood(mood, old=oldmood)
if hasattr(self, 'trollslum') and self.trollslum:
self.trollslum.updateMood(handle, mood)
def newConversation(self, chum, initiated=True):
- if type(chum) in [str, str]:
+ if type(chum) in [str, unicode]:
matchingChums = [c for c in self.chumList.chums if c.handle == chum]
if len(matchingChums) > 0:
mood = matchingChums[0].mood
@@ -1313,7 +1318,7 @@ class PesterWindow(MovingWindow):
if len(matchingChums) == 0:
self.moodRequest.emit(chum)
- if chum.handle in self.convos:
+ if self.convos.has_key(chum.handle):
self.convos[chum.handle].showChat()
return
if self.config.tabs():
@@ -1326,10 +1331,10 @@ class PesterWindow(MovingWindow):
convoWindow.messageSent.connect(self.sendMessage)
convoWindow.windowClosed.connect(self.closeConvo)
self.convos[chum.handle] = convoWindow
- if str(chum.handle).upper() in BOTNAMES:
+ if unicode(chum.handle).upper() in BOTNAMES:
convoWindow.toggleQuirks(True)
convoWindow.quirksOff.setChecked(True)
- if str(chum.handle).upper() in CUSTOMBOTS:
+ if unicode(chum.handle).upper() in CUSTOMBOTS:
self.newConvoStarted.emit(chum.handle, initiated)
else:
self.newConvoStarted.emit(chum.handle, initiated)
@@ -1345,7 +1350,7 @@ class PesterWindow(MovingWindow):
def newMemo(self, channel, timestr, secret=False, invite=False):
if channel == "#pesterchum":
return
- if channel in self.memos:
+ if self.memos.has_key(channel):
self.memos[channel].showChat()
return
# do slider dialog then set
@@ -1460,19 +1465,19 @@ class PesterWindow(MovingWindow):
if hasattr(self, 'moods'):
self.moods.removeButtons()
mood_list = theme["main/moods"]
- mood_list = [dict([(str(k),v) for (k,v) in d.items()])
+ mood_list = [dict([(str(k),v) for (k,v) in d.iteritems()])
for d in mood_list]
self.moods = PesterMoodHandler(self, *[PesterMoodButton(self, **d) for d in mood_list])
self.moods.showButtons()
# chum
addChumStyle = "QPushButton { %s }" % (theme["main/addchum/style"])
- if "main/addchum/pressed" in theme:
+ if theme.has_key("main/addchum/pressed"):
addChumStyle += "QPushButton:pressed { %s }" % (theme["main/addchum/pressed"])
pesterButtonStyle = "QPushButton { %s }" % (theme["main/pester/style"])
- if "main/pester/pressed" in theme:
+ if theme.has_key("main/pester/pressed"):
pesterButtonStyle += "QPushButton:pressed { %s }" % (theme["main/pester/pressed"])
blockButtonStyle = "QPushButton { %s }" % (theme["main/block/style"])
- if "main/block/pressed" in theme:
+ if theme.has_key("main/block/pressed"):
pesterButtonStyle += "QPushButton:pressed { %s }" % (theme["main/block/pressed"])
self.addChumButton.setText(theme["main/addchum/text"])
self.addChumButton.resize(*theme["main/addchum/size"])
@@ -1497,7 +1502,7 @@ class PesterWindow(MovingWindow):
self.mychumcolor.resize(*theme["main/mychumhandle/colorswatch/size"])
self.mychumcolor.move(*theme["main/mychumhandle/colorswatch/loc"])
self.mychumcolor.setStyleSheet("background: %s" % (self.profile().colorhtml()))
- if "main/mychumhandle/currentMood" in self.theme:
+ if self.theme.has_key("main/mychumhandle/currentMood"):
moodicon = self.profile().mood.icon(theme)
if hasattr(self, 'currentMoodIcon') and self.currentMoodIcon:
self.currentMoodIcon.hide()
@@ -1518,36 +1523,40 @@ class PesterWindow(MovingWindow):
self.mychumcolor.setText("")
# sounds
- try:
- self.alarm, self.memosound, self.namesound, self.ceasesound, self.honksound = \
- [QtMultimedia.QSoundEffect() for i in range(5)]
- self.alarm.setSource(QtCore.QUrl.fromLocalFile(theme["main/sounds/alertsound"]))
- self.memosound.setSource(QtCore.QUrl.fromLocalFile(theme["main/sounds/memosound"]))
- self.namesound.setSource(QtCore.QUrl.fromLocalFile("themes/namealarm.wav"))
- self.ceasesound.setSource(QtCore.QUrl.fromLocalFile(theme["main/sounds/ceasesound"]))
- self.honksound.setSource(QtCore.QUrl.fromLocalFile("themes/honk.wav"))
- except Exception as e:
+ if not pygame or not pygame.mixer:
self.alarm = NoneSound()
self.memosound = NoneSound()
self.namesound = NoneSound()
self.ceasesound = NoneSound()
self.honksound = NoneSound()
+ else:
+ try:
+ self.alarm = pygame.mixer.Sound(theme["main/sounds/alertsound"])
+ self.memosound = pygame.mixer.Sound(theme["main/sounds/memosound"])
+ self.namesound = pygame.mixer.Sound("themes/namealarm.wav")
+ self.ceasesound = pygame.mixer.Sound(theme["main/sounds/ceasesound"])
+ self.honksound = pygame.mixer.Sound("themes/honk.wav")
+ except Exception, e:
+ self.alarm = NoneSound()
+ self.memosound = NoneSound()
+ self.namesound = NoneSound()
+ self.ceasesound = NoneSound()
+ self.honksound = NoneSound()
self.setVolume(self.config.volume())
def setVolume(self, vol):
vol = vol/100.0
- self.alarm.setVolume(vol)
- self.memosound.setVolume(vol)
- self.namesound.setVolume(vol)
- self.ceasesound.setVolume(vol)
- self.honksound.setVolume(vol)
+ self.alarm.set_volume(vol)
+ self.memosound.set_volume(vol)
+ self.namesound.set_volume(vol)
+ self.ceasesound.set_volume(vol)
+ self.honksound.set_volume(vol)
def changeTheme(self, theme):
# check theme
try:
themeChecker(theme)
- except ThemeException as xxx_todo_changeme1:
- (inst) = xxx_todo_changeme1
+ except ThemeException, (inst):
themeWarning = QtWidgets.QMessageBox(self)
themeWarning.setText("Theme Error: %s" % (inst))
themeWarning.exec_()
@@ -1630,7 +1639,7 @@ class PesterWindow(MovingWindow):
def pesterSelectedChum(self):
curChum = self.chumList.currentItem()
if curChum:
- text = str(curChum.text(0))
+ text = unicode(curChum.text(0))
if text.rfind(" (") != -1:
text = text[0:text.rfind(" (")]
if text not in self.chumList.groups and \
@@ -1646,7 +1655,7 @@ class PesterWindow(MovingWindow):
self.newConversation(chum)
@QtCore.pyqtSlot('QString')
def closeConvo(self, handle):
- h = str(handle)
+ h = unicode(handle)
try:
chum = self.convos[h].chum
except KeyError:
@@ -1662,7 +1671,7 @@ class PesterWindow(MovingWindow):
del self.convos[h]
@QtCore.pyqtSlot('QString')
def closeMemo(self, channel):
- c = str(channel)
+ c = unicode(channel)
self.chatlog.finish(c)
self.leftChannel.emit(channel)
try:
@@ -1680,27 +1689,27 @@ class PesterWindow(MovingWindow):
@QtCore.pyqtSlot('QString', Mood)
def updateMoodSlot(self, handle, mood):
- h = str(handle)
+ h = unicode(handle)
self.updateMood(h, mood)
@QtCore.pyqtSlot('QString', QtGui.QColor)
def updateColorSlot(self, handle, color):
- h = str(handle)
+ h = unicode(handle)
self.changeColor(h, color)
@QtCore.pyqtSlot('QString', 'QString')
def deliverMessage(self, handle, msg):
- h = str(handle)
- m = str(msg)
+ h = unicode(handle)
+ m = unicode(msg)
self.newMessage(h, m)
@QtCore.pyqtSlot('QString', 'QString', 'QString')
def deliverMemo(self, chan, handle, msg):
- (c, h, m) = (str(chan), str(handle), str(msg))
+ (c, h, m) = (unicode(chan), unicode(handle), unicode(msg))
self.newMemoMsg(c,h,m)
@QtCore.pyqtSlot('QString', 'QString')
def deliverNotice(self, handle, msg):
- h = str(handle)
- m = str(msg)
+ h = unicode(handle)
+ m = unicode(msg)
if m.startswith("Your nickname is now being changed to"):
changedto = m[39:-1]
msgbox = QtWidgets.QMessageBox()
@@ -1710,7 +1719,7 @@ class PesterWindow(MovingWindow):
ret = msgbox.exec_()
elif h == self.randhandler.randNick:
self.randhandler.incoming(msg)
- elif h in self.convos:
+ elif self.convos.has_key(h):
self.newMessage(h, m)
elif h.upper() == "NICKSERV" and "PESTERCHUM:" not in m:
m = nickservmsgs.translate(m)
@@ -1725,7 +1734,7 @@ class PesterWindow(MovingWindow):
msgbox.setStandardButtons(QtWidgets.QMessageBox.Ok | QtWidgets.QMessageBox.Cancel)
ret = msgbox.exec_()
if ret == QtWidgets.QMessageBox.Ok:
- self.newMemo(str(channel), "+0:00")
+ self.newMemo(unicode(channel), "+0:00")
@QtCore.pyqtSlot('QString')
def chanInviteOnly(self, channel):
self.inviteOnlyChan.emit(channel)
@@ -1737,35 +1746,35 @@ class PesterWindow(MovingWindow):
self.modesUpdated.emit(channel, modes)
@QtCore.pyqtSlot('QString', 'QString', 'QString')
def timeCommand(self, chan, handle, command):
- (c, h, cmd) = (str(chan), str(handle), str(command))
+ (c, h, cmd) = (unicode(chan), unicode(handle), unicode(command))
if self.memos[c]:
self.memos[c].timeUpdate(h, cmd)
@QtCore.pyqtSlot('QString', 'QString', 'QString')
def quirkDisable(self, channel, msg, op):
- (c, msg, op) = (str(channel), str(msg), str(op))
- if c not in self.memos:
+ (c, msg, op) = (unicode(channel), unicode(msg), unicode(op))
+ if not self.memos.has_key(c):
return
memo = self.memos[c]
memo.quirkDisable(op, msg)
@QtCore.pyqtSlot('QString', PesterList)
def updateNames(self, channel, names):
- c = str(channel)
+ c = unicode(channel)
# update name DB
self.namesdb[c] = names
# warn interested party of names
self.namesUpdated.emit(c)
@QtCore.pyqtSlot('QString', 'QString', 'QString')
def userPresentUpdate(self, handle, channel, update):
- c = str(channel)
- n = str(handle)
+ c = unicode(channel)
+ n = unicode(handle)
if update == "nick":
l = n.split(":")
oldnick = l[0]
newnick = l[1]
if update in ("quit", "netsplit"):
- for c in list(self.namesdb.keys()):
+ for c in self.namesdb.keys():
try:
i = self.namesdb[c].index(n)
self.namesdb[c].pop(i)
@@ -1782,7 +1791,7 @@ class PesterWindow(MovingWindow):
except KeyError:
self.namesdb[c] = []
elif update == "nick":
- for c in list(self.namesdb.keys()):
+ for c in self.namesdb.keys():
try:
i = self.namesdb[c].index(oldnick)
self.namesdb[c].pop(i)
@@ -1809,12 +1818,12 @@ class PesterWindow(MovingWindow):
available_groups = [g[0] for g in self.config.getGroups()]
self.addchumdialog = AddChumDialog(available_groups, self)
ok = self.addchumdialog.exec_()
- handle = str(self.addchumdialog.chumBox.text()).strip()
- newgroup = str(self.addchumdialog.newgroup.text()).strip()
+ handle = unicode(self.addchumdialog.chumBox.text()).strip()
+ newgroup = unicode(self.addchumdialog.newgroup.text()).strip()
selectedGroup = self.addchumdialog.groupBox.currentText()
group = newgroup if newgroup else selectedGroup
if ok:
- handle = str(handle)
+ handle = unicode(handle)
if handle in [h.handle for h in self.chumList.chums]:
self.addchumdialog = None
return
@@ -1846,10 +1855,10 @@ class PesterWindow(MovingWindow):
@QtCore.pyqtSlot('QString')
def blockChum(self, handle):
- h = str(handle)
+ h = unicode(handle)
self.config.addBlocklist(h)
self.config.removeChum(h)
- if h in self.convos:
+ if self.convos.has_key(h):
convo = self.convos[h]
msg = self.profile().pestermsg(convo.chum, QtGui.QColor(self.theme["convo/systemMsgColor"]), self.theme["convo/text/blocked"])
convo.textArea.append(convertTags(msg))
@@ -1864,9 +1873,9 @@ class PesterWindow(MovingWindow):
@QtCore.pyqtSlot('QString')
def unblockChum(self, handle):
- h = str(handle)
+ h = unicode(handle)
self.config.delBlocklist(h)
- if h in self.convos:
+ if self.convos.has_key(h):
convo = self.convos[h]
msg = self.profile().pestermsg(convo.chum, QtGui.QColor(self.theme["convo/systemMsgColor"]), self.theme["convo/text/unblocked"])
convo.textArea.append(convertTags(msg))
@@ -1887,7 +1896,7 @@ class PesterWindow(MovingWindow):
self.randhandler.setIdle(True)
sysColor = QtGui.QColor(self.theme["convo/systemMsgColor"])
verb = self.theme["convo/text/idle"]
- for (h, convo) in self.convos.items():
+ for (h, convo) in self.convos.iteritems():
if convo.chumopen:
msg = self.profile().idlemsg(sysColor, verb)
convo.textArea.append(convertTags(msg))
@@ -1921,7 +1930,7 @@ class PesterWindow(MovingWindow):
return
fp = open(f, 'r')
regexp_state = None
- for l in fp:
+ for l in fp.xreadlines():
# import chumlist
l = l.rstrip()
chum_mo = re.match("handle: ([A-Za-z0-9]+)", l)
@@ -1935,7 +1944,7 @@ class PesterWindow(MovingWindow):
replace = replace_mo.group(1)
try:
re.compile(regexp_state)
- except re.error as e:
+ except re.error, e:
continue
newquirk = pesterQuirk({"type": "regexp",
"from": regexp_state,
@@ -1971,18 +1980,18 @@ class PesterWindow(MovingWindow):
@QtCore.pyqtSlot()
def joinSelectedMemo(self):
- time = str(self.memochooser.timeinput.text())
+ time = unicode(self.memochooser.timeinput.text())
secret = self.memochooser.secretChannel.isChecked()
invite = self.memochooser.inviteChannel.isChecked()
if self.memochooser.newmemoname():
newmemo = self.memochooser.newmemoname()
- channel = "#"+str(newmemo).replace(" ", "_")
+ channel = "#"+unicode(newmemo).replace(" ", "_")
channel = re.sub(r"[^A-Za-z0-9#_]", "", channel)
self.newMemo(channel, time, secret=secret, invite=invite)
for SelectedMemo in self.memochooser.SelectedMemos():
- channel = "#"+str(SelectedMemo.target)
+ channel = "#"+unicode(SelectedMemo.target)
self.newMemo(channel, time)
self.memochooser = None
@@ -2009,12 +2018,12 @@ class PesterWindow(MovingWindow):
@QtCore.pyqtSlot('QString')
def userListAdd(self, handle):
- h = str(handle)
+ h = unicode(handle)
chum = PesterProfile(h, chumdb=self.chumdb)
self.addChum(chum)
@QtCore.pyqtSlot('QString')
def userListPester(self, handle):
- h = str(handle)
+ h = unicode(handle)
self.newConversation(h)
@QtCore.pyqtSlot()
def userListClose(self):
@@ -2034,7 +2043,7 @@ class PesterWindow(MovingWindow):
@QtCore.pyqtSlot()
def updateQuirks(self):
for i in range(self.quirkmenu.quirkList.topLevelItemCount()):
- curgroup = str(self.quirkmenu.quirkList.topLevelItem(i).text(0))
+ curgroup = unicode(self.quirkmenu.quirkList.topLevelItem(i).text(0))
for j in range(self.quirkmenu.quirkList.topLevelItem(i).childCount()):
item = self.quirkmenu.quirkList.topLevelItem(i).child(j)
item.quirk.quirk["on"] = item.quirk.on = (item.checkState(0) == QtCore.Qt.Checked)
@@ -2057,7 +2066,7 @@ class PesterWindow(MovingWindow):
(chum, ok) = QtWidgets.QInputDialog.getText(self, "Pester Chum", "Enter a handle to pester:")
try:
if ok:
- self.newConversation(str(chum))
+ self.newConversation(unicode(chum))
except:
pass
finally:
@@ -2085,7 +2094,7 @@ class PesterWindow(MovingWindow):
if not self.addgroupdialog:
(gname, ok) = QtWidgets.QInputDialog.getText(self, "Add Group", "Enter a name for the new group:")
if ok:
- gname = str(gname)
+ gname = unicode(gname)
if re.search("[^A-Za-z0-9_\s]", gname) is not None:
msgbox = QtWidgets.QMessageBox()
msgbox.setInformativeText("THIS IS NOT A VALID GROUP NAME")
@@ -2135,7 +2144,7 @@ class PesterWindow(MovingWindow):
# combine
self.createTabWindow()
newconvos = {}
- for (h,c) in self.convos.items():
+ for (h,c) in self.convos.iteritems():
c.setParent(self.tabconvo)
self.tabconvo.addChat(c)
self.tabconvo.show()
@@ -2165,7 +2174,7 @@ class PesterWindow(MovingWindow):
# combine
newmemos = {}
self.createMemoTabWindow()
- for (h,m) in self.memos.items():
+ for (h,m) in self.memos.iteritems():
m.setParent(self.tabmemo)
self.tabmemo.addChat(m)
self.tabmemo.show()
@@ -2214,7 +2223,7 @@ class PesterWindow(MovingWindow):
# timestamps
timestampsetting = self.optionmenu.timestampcheck.isChecked()
self.config.set("showTimeStamps", timestampsetting)
- timeformatsetting = str(self.optionmenu.timestampBox.currentText())
+ timeformatsetting = unicode(self.optionmenu.timestampBox.currentText())
if timeformatsetting == "12 hour":
self.config.set("time12Format", True)
else:
@@ -2324,7 +2333,7 @@ class PesterWindow(MovingWindow):
self.config.set('blink', blinksetting)
# toast notifications
self.tm.setEnabled(self.optionmenu.notifycheck.isChecked())
- self.tm.setCurrentType(str(self.optionmenu.notifyOptions.currentText()))
+ self.tm.setCurrentType(unicode(self.optionmenu.notifyOptions.currentText()))
notifysetting = 0
if self.optionmenu.notifySigninCheck.isChecked():
notifysetting |= self.config.SIGNIN
@@ -2364,7 +2373,7 @@ class PesterWindow(MovingWindow):
newmodes = self.optionmenu.modechange.text()
if newmodes:
self.setChannelMode.emit(self.profile().handle, newmodes, "")
- except Exception as e:
+ except Exception, e:
logging.error(e)
finally:
self.optionmenu = None
@@ -2391,13 +2400,13 @@ class PesterWindow(MovingWindow):
@QtCore.pyqtSlot()
def themeSelected(self, override=False):
if not override:
- themename = str(self.optionmenu.themeBox.currentText())
+ themename = unicode(self.optionmenu.themeBox.currentText())
else:
themename = override
if override or themename != self.theme.name:
try:
self.changeTheme(pesterTheme(themename))
- except ValueError as e:
+ except ValueError, e:
themeWarning = QtWidgets.QMessageBox(self)
themeWarning.setText("Theme Error: %s" % (e))
themeWarning.exec_()
@@ -2413,14 +2422,14 @@ class PesterWindow(MovingWindow):
def profileSelected(self):
if self.chooseprofile.profileBox and \
self.chooseprofile.profileBox.currentIndex() > 0:
- handle = str(self.chooseprofile.profileBox.currentText())
+ handle = unicode(self.chooseprofile.profileBox.currentText())
if handle == self.profile().handle:
self.chooseprofile = None
return
self.userprofile = userProfile(handle)
self.changeTheme(self.userprofile.getTheme())
else:
- handle = str(self.chooseprofile.chumHandle.text())
+ handle = unicode(self.chooseprofile.chumHandle.text())
if handle == self.profile().handle:
self.chooseprofile = None
return
@@ -2519,7 +2528,7 @@ class PesterWindow(MovingWindow):
if not hasattr(self, 'chooseprofile'):
self.chooseprofile = None
if not self.chooseprofile:
- h = str(handle)
+ h = unicode(handle)
self.changeProfile(collision=h)
@QtCore.pyqtSlot('QString')
def myHandleChanged(self, handle):
@@ -2607,6 +2616,15 @@ class MainProgram(QtCore.QObject):
options = self.oppts(sys.argv[1:])
+ if pygame and pygame.mixer:
+ # we could set the frequency higher but i love how cheesy it sounds
+ try:
+ pygame.mixer.init()
+ pygame.mixer.init()
+ except pygame.error, e:
+ print "Warning: No sound! %s" % (e)
+ else:
+ print "Warning: No sound!"
self.widget = PesterWindow(options, app=self.app)
self.widget.show()
@@ -2664,7 +2682,7 @@ class MainProgram(QtCore.QObject):
@QtCore.pyqtSlot()
def runUpdateSlot(self):
- q = queue.Queue(1)
+ q = Queue.Queue(1)
s = threading.Thread(target=version.updateCheck, args=(q,))
w = threading.Thread(target=self.showUpdate, args=(q,))
w.start()
@@ -2795,7 +2813,7 @@ Click this message to never see this again.")
for c in self.widget.tabmemo.convos:
self.irc.joinChannel(c)
else:
- for c in list(self.widget.memos.values()):
+ for c in self.widget.memos.values():
self.irc.joinChannel(c.channel)
return True
diff --git a/profile.py b/profile.py
index 6d5ef1e..64b6d28 100644
--- a/profile.py
+++ b/profile.py
@@ -41,17 +41,17 @@ class PesterLog(object):
if not self.parent.config.logPesters() & self.parent.config.LOG: return
if not self.parent.config.logPesters() & self.parent.config.STAMP:
time = ""
- if str(handle).upper() == "NICKSERV": return
+ if unicode(handle).upper() == "NICKSERV": return
#watch out for illegal characters
handle = re.sub(r'[<>:"/\\|?*]', "_", handle)
bbcodemsg = time + convertTags(msg, "bbcode")
html = time + convertTags(msg, "html")+"
"
msg = time +convertTags(msg, "text")
modes = {"bbcode": bbcodemsg, "html": html, "text": msg}
- if handle not in self.convos:
+ if not self.convos.has_key(handle):
time = datetime.now().strftime("%Y-%m-%d.%H.%M")
self.convos[handle] = {}
- for (format, t) in modes.items():
+ for (format, t) in modes.iteritems():
if not os.path.exists("%s/%s/%s/%s" % (self.logpath, self.handle, handle, format)):
os.makedirs("%s/%s/%s/%s" % (self.logpath, self.handle, handle, format))
try:
@@ -63,7 +63,7 @@ class PesterLog(object):
errmsg.show()
continue
self.convos[handle][format] = fp
- for (format, t) in modes.items():
+ for (format, t) in modes.iteritems():
f = self.convos[handle][format]
if platform.system() == "Windows":
f.write(t+"\r\n")
@@ -71,14 +71,14 @@ class PesterLog(object):
f.write(t+"\r\n")
f.flush()
def finish(self, handle):
- if handle not in self.convos:
+ if not self.convos.has_key(handle):
return
- for f in list(self.convos[handle].values()):
+ for f in self.convos[handle].values():
f.close()
del self.convos[handle]
def close(self):
- for h in list(self.convos.keys()):
- for f in list(self.convos[h].values()):
+ for h in self.convos.keys():
+ for f in self.convos[h].values():
f.close()
class userConfig(object):
@@ -100,7 +100,7 @@ class userConfig(object):
fp = open(self.filename)
self.config = json.load(fp)
fp.close()
- if "defaultprofile" in self.config:
+ if self.config.has_key("defaultprofile"):
self.userprofile = userProfile(self.config["defaultprofile"])
else:
self.userprofile = None
@@ -125,7 +125,7 @@ class userConfig(object):
fp.close()
def chums(self):
- if 'chums' not in self.config:
+ if not self.config.has_key('chums'):
self.set("chums", [])
return self.config.get('chums', [])
def setChums(self, newchums):
@@ -148,19 +148,19 @@ class userConfig(object):
def tabs(self):
return self.config.get("tabs", True)
def tabMemos(self):
- if 'tabmemos' not in self.config:
+ if not self.config.has_key('tabmemos'):
self.set("tabmemos", self.tabs())
return self.config.get("tabmemos", True)
def showTimeStamps(self):
- if 'showTimeStamps' not in self.config:
+ if not self.config.has_key('showTimeStamps'):
self.set("showTimeStamps", True)
return self.config.get('showTimeStamps', True)
def time12Format(self):
- if 'time12Format' not in self.config:
+ if not self.config.has_key('time12Format'):
self.set("time12Format", True)
return self.config.get('time12Format', True)
def showSeconds(self):
- if 'showSeconds' not in self.config:
+ if not self.config.has_key('showSeconds'):
self.set("showSeconds", False)
return self.config.get('showSeconds', False)
def sortMethod(self):
@@ -174,11 +174,11 @@ class userConfig(object):
return g[1]
return True
def showEmptyGroups(self):
- if 'emptyGroups' not in self.config:
+ if not self.config.has_key('emptyGroups'):
self.set("emptyGroups", False)
return self.config.get('emptyGroups', False)
def showOnlineNumbers(self):
- if 'onlineNumbers' not in self.config:
+ if not self.config.has_key('onlineNumbers'):
self.set("onlineNumbers", False)
return self.config.get('onlineNumbers', False)
def logPesters(self):
@@ -238,7 +238,7 @@ class userConfig(object):
newchums = [c for c in self.config['chums'] if c != handle]
self.set("chums", newchums)
def getBlocklist(self):
- if 'block' not in self.config:
+ if not self.config.has_key('block'):
self.set('block', [])
return self.config['block']
def addBlocklist(self, handle):
@@ -251,7 +251,7 @@ class userConfig(object):
l.pop(l.index(handle))
self.set('block', l)
def getGroups(self):
- if 'groups' not in self.groups:
+ if not self.groups.has_key('groups'):
self.saveGroups([["Chums", True]])
return self.groups.get('groups', [["Chums", True]])
def addGroup(self, group, open=True):
@@ -285,7 +285,7 @@ class userConfig(object):
self.groups['groups'] = groups
try:
jsonoutput = json.dumps(self.groups)
- except ValueError as e:
+ except ValueError, e:
raise e
fp = open("%s/groups.js" % (self.logpath), 'w')
fp.write(jsonoutput)
@@ -300,7 +300,7 @@ class userConfig(object):
return self.parent.portOverride
return self.config.get('port', '6667')
def soundOn(self):
- if 'soundon' not in self.config:
+ if not self.config.has_key('soundon'):
self.set('soundon', True)
return self.config['soundon']
def chatSound(self):
@@ -319,7 +319,7 @@ class userConfig(object):
self.config[item] = setting
try:
jsonoutput = json.dumps(self.config)
- except ValueError as e:
+ except ValueError, e:
raise e
fp = open(self.filename, 'w')
fp.write(jsonoutput)
@@ -356,7 +356,7 @@ class userProfile(object):
if type(user) is PesterProfile:
self.chat = user
self.userprofile = {"handle":user.handle,
- "color": str(user.color.name()),
+ "color": unicode(user.color.name()),
"quirks": [],
"theme": "pesterchum"}
self.theme = pesterTheme("pesterchum")
@@ -377,7 +377,7 @@ class userProfile(object):
fp.close()
try:
self.theme = pesterTheme(self.userprofile["theme"])
- except ValueError as e:
+ except ValueError, e:
self.theme = pesterTheme("pesterchum")
self.lastmood = self.userprofile.get('lastmood', self.theme["main/defaultmood"])
self.chat = PesterProfile(self.userprofile["handle"],
@@ -402,7 +402,7 @@ class userProfile(object):
try:
with open(_datadir+"passwd.js") as fp:
self.passwd = json.load(fp)
- except Exception as e:
+ except Exception, e:
self.passwd = {}
self.autoidentify = False
self.nickservpass = ""
@@ -418,7 +418,7 @@ class userProfile(object):
self.save()
def setColor(self, color):
self.chat.color = color
- self.userprofile["color"] = str(color.name())
+ self.userprofile["color"] = unicode(color.name())
self.save()
def setQuirks(self, quirks):
self.quirks = quirks
@@ -436,7 +436,7 @@ class userProfile(object):
try:
for (i,m) in enumerate(mentions):
re.compile(m)
- except re.error as e:
+ except re.error, e:
logging.error("#%s Not a valid regular expression: %s" % (i, e))
else:
self.mentions = mentions
@@ -479,19 +479,19 @@ class userProfile(object):
return
try:
jsonoutput = json.dumps(self.userprofile)
- except ValueError as e:
+ except ValueError, e:
raise e
fp = open("%s/%s.js" % (self.profiledir, handle), 'w')
fp.write(jsonoutput)
fp.close()
def saveNickServPass(self):
# remove profiles with no passwords
- for h,t in list(self.passwd.items()):
+ for h,t in self.passwd.items():
if "auto" not in t or "pw" not in t or t["pw"] == "":
del self.passwd[h]
try:
jsonoutput = json.dumps(self.passwd, indent=4)
- except ValueError as e:
+ except ValueError, e:
raise e
with open(_datadir+"passwd.js", 'w') as fp:
fp.write(jsonoutput)
@@ -526,7 +526,7 @@ class PesterProfileDB(dict):
fp.close()
u = []
- for (handle, c) in chumdict.items():
+ for (handle, c) in chumdict.iteritems():
options = dict()
if 'group' in c:
options['group'] = c['group']
@@ -543,39 +543,39 @@ class PesterProfileDB(dict):
def save(self):
try:
fp = open("%s/chums.js" % (self.logpath), 'w')
- chumdict = dict([p.plaindict() for p in self.values()])
+ chumdict = dict([p.plaindict() for p in self.itervalues()])
json.dump(chumdict, fp)
fp.close()
- except Exception as e:
+ except Exception, e:
raise e
def getColor(self, handle, default=None):
- if handle not in self:
+ if not self.has_key(handle):
return default
else:
return self[handle].color
def setColor(self, handle, color):
- if handle in self:
+ if self.has_key(handle):
self[handle].color = color
else:
self[handle] = PesterProfile(handle, color)
def getGroup(self, handle, default="Chums"):
- if handle not in self:
+ if not self.has_key(handle):
return default
else:
return self[handle].group
def setGroup(self, handle, theGroup):
- if handle in self:
+ if self.has_key(handle):
self[handle].group = theGroup
else:
self[handle] = PesterProfile(handle, group=theGroup)
self.save()
def getNotes(self, handle, default=""):
- if handle not in self:
+ if not self.has_key(handle):
return default
else:
return self[handle].notes
def setNotes(self, handle, notes):
- if handle in self:
+ if self.has_key(handle):
self[handle].notes = notes
else:
self[handle] = PesterProfile(handle, notes=notes)
@@ -604,7 +604,7 @@ class pesterTheme(dict):
except IOError:
theme = json.loads("{}")
self.update(theme)
- if "inherits" in self:
+ if self.has_key("inherits"):
self.inheritedTheme = pesterTheme(self["inherits"])
if not default:
self.defaultTheme = pesterTheme("pesterchum", default=True)
@@ -612,7 +612,7 @@ class pesterTheme(dict):
keys = key.split("/")
try:
v = dict.__getitem__(self, keys.pop(0))
- except KeyError as e:
+ except KeyError, e:
if hasattr(self, 'inheritedTheme'):
return self.inheritedTheme[key]
if hasattr(self, 'defaultTheme'):
@@ -622,7 +622,7 @@ class pesterTheme(dict):
for k in keys:
try:
v = v[k]
- except KeyError as e:
+ except KeyError, e:
if hasattr(self, 'inheritedTheme'):
return self.inheritedTheme[key]
if hasattr(self, 'defaultTheme'):
@@ -631,8 +631,8 @@ class pesterTheme(dict):
raise e
return v
def pathHook(self, d):
- for (k, v) in d.items():
- if type(v) is str:
+ for (k, v) in d.iteritems():
+ if type(v) is unicode:
s = Template(v)
d[k] = s.safe_substitute(path=self.path)
return d
@@ -658,6 +658,6 @@ class pesterTheme(dict):
return False if v is None else True
except KeyError:
if hasattr(self, 'inheritedTheme'):
- return key in self.inheritedTheme
+ return self.inheritedTheme.has_key(key)
else:
return False
diff --git a/pyquirks.py b/pyquirks.py
index e275cc5..f4a5b37 100644
--- a/pyquirks.py
+++ b/pyquirks.py
@@ -12,20 +12,20 @@ class PythonQuirks(ScriptQuirks):
def modHas(self, module, attr):
if attr == 'commands':
variables = vars(module)
- for name, obj in variables.items():
+ for name, obj in variables.iteritems():
if self.modHas(obj, 'command'):
return True
return hasattr(module, attr)
def register(self, module):
variables = vars(module)
- for name, obj in variables.items():
+ for name, obj in variables.iteritems():
if self.modHas(obj, 'command'):
try:
- if not isinstance(obj("test"), str):
+ if not isinstance(obj("test"), basestring):
raise Exception
except:
- print("Quirk malformed: %s" % (obj.command))
+ print "Quirk malformed: %s" % (obj.command)
msgbox = QtWidgets.QMessageBox()
msgbox.setWindowTitle("Error!")
msgbox.setText("Quirk malformed: %s" % (obj.command))
diff --git a/quirks.py b/quirks.py
index 2863918..7499abe 100644
--- a/quirks.py
+++ b/quirks.py
@@ -20,7 +20,7 @@ class ScriptQuirks(object):
self.last = self.quirks.copy()
self.quirks.clear()
for script in self.scripts:
- print(script.getExtension())
+ print script.getExtension()
script.load()
#print script.quirks
for q in script.quirks:
@@ -31,9 +31,9 @@ class ScriptQuirks(object):
del self.quirks[k]
#print self.quirks
if self.quirks:
- print('Registered quirks:', '(), '.join(self.quirks) + "()")
+ print 'Registered quirks:', '(), '.join(self.quirks) + "()"
else:
- print("Warning: Couldn't find any script quirks")
+ print "Warning: Couldn't find any script quirks"
def add(self, script):
self.scripts.append(script)
@@ -64,8 +64,8 @@ class ScriptQuirks(object):
module = self.loadModule(name, filename)
if module is None:
continue
- except Exception as e:
- print("Error loading %s: %s (in quirks.py)" % (os.path.basename(name), e))
+ except Exception, e:
+ print "Error loading %s: %s (in quirks.py)" % (os.path.basename(name), e)
msgbox = QtWidgets.QMessageBox()
msgbox.setWindowTitle("Error!")
msgbox.setText("Error loading %s: %s (in quirks.py)" % (os.path.basename(filename), e))
diff --git a/randomer.py b/randomer.py
index 4df2b04..af60239 100644
--- a/randomer.py
+++ b/randomer.py
@@ -63,6 +63,6 @@ class RandomHandler(QtCore.QObject):
msgbox.setInformativeText("Try again later :(")
msgbox.exec_()
return
- name = str(l[1])
- print(name)
+ name = unicode(l[1])
+ print name
self.mainwindow.newConversation(name)
diff --git a/toast.py b/toast.py
index 2d89cad..c040d62 100644
--- a/toast.py
+++ b/toast.py
@@ -4,29 +4,27 @@ import time, os
import ostools
from PyQt5 import QtGui, QtCore, QtWidgets
-import logging
-
try:
import pynotify
except:
pynotify = None
-class DefaultToast(QtWidgets.QWidget):
+class DefaultToast(object):
def __init__(self, parent, **kwds):
- super().__init__(parent)
+ super(DefaultToast, self).__init__(parent, **kwds)
self.machine = kwds.get('machine')
self.title = kwds.get('title')
self.msg = kwds.get('msg')
self.icon = kwds.get('icon')
def show(self):
- print(self.title, self.msg, self.icon)
+ print self.title, self.msg, self.icon
self.done()
def done(self):
t = self.machine.toasts[0]
if t.title == self.title and t.msg == self.msg and t.icon == self.icon:
self.machine.toasts.pop(0)
self.machine.displaying = False
- print("Done")
+ print "Done"
class ToastMachine(object):
class __Toast__(object):
@@ -75,7 +73,7 @@ class ToastMachine(object):
def realShow(self):
self.machine.displaying = True
t = None
- for (k,v) in self.machine.types.items():
+ for (k,v) in self.machine.types.iteritems():
if self.machine.type == k:
try:
args = inspect.getargspec(v.__init__).args
@@ -145,15 +143,15 @@ class ToastMachine(object):
if type in self.types:
if type == "libnotify":
if not pynotify or not pynotify.init("ToastMachine"):
- print("Problem initilizing pynotify")
+ print "Problem initilizing pynotify"
return
#self.type = type = "default"
elif type == "twmn":
from libs import pytwmn
try:
pytwmn.init()
- except pytwmn.ERROR as e:
- print("Problem initilizing pytwmn: " + str(e))
+ except pytwmn.ERROR, e:
+ print "Problem initilizing pytwmn: " + str(e)
return
#self.type = type = "default"
self.type = type
@@ -179,11 +177,9 @@ class ToastMachine(object):
self.showNext()
-class PesterToast(DefaultToast):
+class PesterToast(QtWidgets.QWidget, DefaultToast):
def __init__(self, machine, title, msg, icon, time=3000, parent=None):
- logging.info(isinstance(parent, QtWidgets.QWidget))
- kwds = dict(machine=machine, title=title, msg=msg, icon=icon)
- super().__init__(parent, **kwds)
+ super(PesterToast, self).__init__(self, parent, machine=machine, title=title, msg=msg, icon=icon)
self.machine = machine
self.time = time
@@ -214,6 +210,7 @@ class PesterToast(DefaultToast):
self.icon.pixmap().fill(QtGui.QColor(0,0,0,0))
layout_0 = QtWidgets.QVBoxLayout()
+ layout_0.setMargin(0)
layout_0.setContentsMargins(0, 0, 0, 0)
if self.icon:
@@ -240,7 +237,7 @@ class PesterToast(DefaultToast):
self.msg.setStyleSheet(self.parent().theme["toasts/content/style"])
self.layout().setSpacing(0)
- self.msg.setText(PesterToast.wrapText(self.msg.font(), str(self.msg.text()), self.parent().theme["toasts/width"], self.parent().theme["toasts/content/style"]))
+ self.msg.setText(PesterToast.wrapText(self.msg.font(), unicode(self.msg.text()), self.parent().theme["toasts/width"], self.parent().theme["toasts/content/style"]))
p = QtWidgets.QApplication.desktop().availableGeometry(self).bottomRight()
o = QtWidgets.QApplication.desktop().screenGeometry(self).bottomRight()
@@ -258,8 +255,8 @@ class PesterToast(DefaultToast):
def done(self):
QtWidgets.QWidget.hide(self)
t = self.machine.toasts[0]
- if t.title == str(self.title.text()) and \
- t.msg == str(self.content):
+ if t.title == unicode(self.title.text()) and \
+ t.msg == unicode(self.content):
self.machine.toasts.pop(0)
self.machine.displaying = False
if self.machine.on:
@@ -269,7 +266,7 @@ class PesterToast(DefaultToast):
@QtCore.pyqtSlot()
def reverseTrigger(self):
if self.time >= 0:
- QtCore.QTimer.singleShot(self.time, self.reverseStart)
+ QtCore.QTimer.singleShot(self.time, self, QtCore.SLOT('reverseStart()'))
@QtCore.pyqtSlot()
def reverseStart(self):
@@ -286,7 +283,7 @@ class PesterToast(DefaultToast):
def updateBottomLeftAnimation(self, value):
p = QtWidgets.QApplication.desktop().availableGeometry(self).bottomRight()
val = float(self.height())/100
- self.move(p.x()-self.width(), p.y() - (value * val) +1)
+ self.move(p.x()-self.width(), p.y() - (value.toInt()[0] * val) +1)
self.layout().setSpacing(0)
QtWidgets.QWidget.show(self)
@@ -352,7 +349,7 @@ class PesterToast(DefaultToast):
break
if (metric.width(text[:lastspace]) > maxwidth) or \
len(text[:lastspace]) < 1:
- for i in range(len(text)):
+ for i in xrange(len(text)):
if metric.width(text[:i]) > maxwidth:
lastspace = i-1
break
diff --git a/updatecheck.py b/updatecheck.py
index bba4302..b5dfd5b 100644
--- a/updatecheck.py
+++ b/updatecheck.py
@@ -34,20 +34,19 @@ class MSPAChecker(QtWidgets.QWidget):
raise
if os.path.exists("status_old.pkl"):
os.remove("status_old.pkl")
- except Exception as e:
- print(e)
+ except Exception, e:
+ print e
msg = QtWidgets.QMessageBox(self)
msg.setText("Problems writing save file.")
msg.show()
@QtCore.pyqtSlot()
def check_site_wrapper(self):
- return # turn off MSPA check; python3 doesnt like it
if not self.mainwindow.config.checkMSPA():
return
if self.lock:
return
- print("Checking MSPA updates...")
+ print "Checking MSPA updates..."
s = threading.Thread(target=self.check_site)
s.start()
@@ -89,7 +88,7 @@ class MSPAChecker(QtWidgets.QWidget):
@QtCore.pyqtSlot()
def visit_site(self):
- print(self.status['last_visited']['link'])
+ print self.status['last_visited']['link']
QtGui.QDesktopServices.openUrl(QtCore.QUrl(self.status['last_visited']['link'], QtCore.QUrl.TolerantMode))
if self.status['last_seen']['pubdate'] > self.status['last_visited']['pubdate']:
#Visited for the first time. Untrip the icon and remember that we saw it.
diff --git a/version.py b/version.py
index b9acc98..59b0d76 100644
--- a/version.py
+++ b/version.py
@@ -1,4 +1,4 @@
-import urllib.request, urllib.parse, urllib.error
+import urllib
import re
import time
try:
@@ -67,31 +67,31 @@ def lexVersion(short=False):
# Naughty I know, but it lets me grab it from the bash script.
if __name__ == "__main__":
- print(lexVersion())
+ print lexVersion()
def verStrToNum(ver):
w = re.match("(\d+\.?\d+)\.(\d+)-?([A-Za-z]{0,2})\.?(\d*):(\S+)", ver)
if not w:
- print("Update check Failure: 3"); return
+ print "Update check Failure: 3"; return
full = ver[:ver.find(":")]
return full,w.group(1),w.group(2),w.group(3),w.group(4),w.group(5)
def updateCheck(q):
time.sleep(3)
- data = urllib.parse.urlencode({"type" : USER_TYPE, "os" : OS_TYPE, "install" : INSTALL_TYPE})
+ data = urllib.urlencode({"type" : USER_TYPE, "os" : OS_TYPE, "install" : INSTALL_TYPE})
try:
- f = urllib.request.urlopen("http://distantsphere.com/pesterchum.php?" + data)
+ f = urllib.urlopen("http://distantsphere.com/pesterchum.php?" + data)
except:
- print("Update check Failure: 1"); return q.put((False,1))
+ print "Update check Failure: 1"; return q.put((False,1))
newest = f.read()
f.close()
if not newest or newest[0] == "<":
- print("Update check Failure: 2"); return q.put((False,2))
+ print "Update check Failure: 2"; return q.put((False,2))
try:
(full, major, minor, status, revision, url) = verStrToNum(newest)
except TypeError:
return q.put((False,3))
- print(full)
+ print full
if major <= _pcMajor:
if minor <= _pcMinor:
if status:
@@ -102,7 +102,7 @@ def updateCheck(q):
if not _pcStatus:
if revision <= _pcRevision:
return q.put((False,0))
- print("A new version of Pesterchum is avaliable!")
+ print "A new version of Pesterchum is avaliable!"
q.put((full,url))
@@ -128,9 +128,9 @@ def copyUpdate(path):
def updateExtract(url, extension):
if extension:
fn = "update" + extension
- urllib.request.urlretrieve(url, fn)
+ urllib.urlretrieve(url, fn)
else:
- fn = urllib.request.urlretrieve(url)[0]
+ fn = urllib.urlretrieve(url)[0]
if tarfile and tarfile.is_tarfile(fn):
extension = ".tar.gz"
elif zipfile.is_zipfile(fn):
@@ -144,17 +144,17 @@ def updateExtract(url, extension):
except:
pass
- print(url, fn, extension)
+ print url, fn, extension
if extension == ".exe":
pass
elif extension == ".zip" or extension.startswith(".tar"):
if extension == ".zip":
from zipfile import is_zipfile as is_updatefile, ZipFile as openupdate
- print("Opening .zip")
+ print "Opening .zip"
elif tarfile and extension.startswith(".tar"):
from tarfile import is_tarfile as is_updatefile, open as openupdate
- print("Opening .tar")
+ print "Opening .tar"
else:
return