From 5d839aae47f78469b716b30ec1f1f3abbc487b7e Mon Sep 17 00:00:00 2001 From: karxi Date: Sun, 13 Nov 2016 01:12:58 -0500 Subject: [PATCH] Revert "python 2 to 3 first shot" This reverts commit 7bc57b8b7d9c4e0c74809c7057cf28abe87970b9. Practically speaking, this reverts the Python 3 changes, since they're broken. --- bugreport.py | 20 +- convo.py | 38 +- dataobjs.py | 14 +- generic.py | 12 +- irc.py | 89 +- libs/feedparser.py | 3704 +++++++++++++++---------------------------- libs/magic.py | 174 +- libs/pytwmn.py | 4 +- libs/sgmllib.py | 547 ------- logviewer.py | 10 +- luaquirks.py | 6 +- memos.py | 66 +- menus.py | 42 +- mispeller.py | 4 +- ostools.py | 6 +- oyoyo/client.py | 21 +- oyoyo/cmdhandler.py | 14 +- oyoyo/examplebot.py | 4 +- oyoyo/helpers.py | 2 +- oyoyo/ircevents.py | 4 +- oyoyo/services.py | 4 +- parsetools.py | 24 +- pesterchum.py | 276 ++-- profile.py | 88 +- pyquirks.py | 8 +- quirks.py | 10 +- randomer.py | 4 +- toast.py | 37 +- updatecheck.py | 9 +- version.py | 28 +- 30 files changed, 1737 insertions(+), 3532 deletions(-) delete mode 100644 libs/sgmllib.py diff --git a/bugreport.py b/bugreport.py index c57b8bb..04f7cff 100644 --- a/bugreport.py +++ b/bugreport.py @@ -1,5 +1,5 @@ from PyQt5 import QtGui, QtCore, QtWidgets -import urllib.request, urllib.parse, urllib.error +import urllib import ostools import version @@ -51,13 +51,13 @@ class BugReporter(QtWidgets.QDialog): @QtCore.pyqtSlot() def sendReport(self): - name = str(self.mainwindow.profile().handle) - bestname = str(self.name.text()) + name = unicode(self.mainwindow.profile().handle) + bestname = unicode(self.name.text()) os = ostools.osVer() full = ostools.platform.platform() python = ostools.platform.python_version() qt = QtCore.qVersion() - msg = str(self.textarea.toPlainText()) + msg = unicode(self.textarea.toPlainText()) if len(bestname) <= 0 or len(msg) <= 0: msgbox = QtWidgets.QMessageBox() @@ -68,13 +68,13 @@ class BugReporter(QtWidgets.QDialog): return 
QtWidgets.QDialog.accept(self) - data = urllib.parse.urlencode({"name":name, "version": version._pcVersion, "bestname":bestname, "os":os, "platform":full, "python":python, "qt":qt, "msg":msg}) - print("Sending...") - f = urllib.request.urlopen("http://distantsphere.com/pc/reporter.php", data) + data = urllib.urlencode({"name":name, "version": version._pcVersion, "bestname":bestname, "os":os, "platform":full, "python":python, "qt":qt, "msg":msg}) + print "Sending..." + f = urllib.urlopen("http://distantsphere.com/pc/reporter.php", data) text = f.read() - print(text) + print text if text == "success!": - print("Sent!") + print "Sent!" else: - print("Problems ):") + print "Problems ):" diff --git a/convo.py b/convo.py index 1a7fc09..b9a1c93 100644 --- a/convo.py +++ b/convo.py @@ -1,7 +1,7 @@ from string import Template import re import platform -import http.client, urllib.request, urllib.parse, urllib.error +import httplib, urllib from time import strftime from copy import copy from datetime import datetime, timedelta @@ -77,7 +77,7 @@ class PesterTabWindow(QtWidgets.QFrame): mods = event.modifiers() if ((mods & QtCore.Qt.ControlModifier) and keypress == QtCore.Qt.Key_Tab): - handles = list(self.convos.keys()) + handles = self.convos.keys() waiting = self.mainwindow.waitingMessages.waitingHandles() waitinghandles = list(set(handles) & set(waiting)) if len(waitinghandles) > 0: @@ -114,7 +114,7 @@ class PesterTabWindow(QtWidgets.QFrame): i = self.tabs.tabAt(self.mapFromGlobal(QtGui.QCursor.pos())) if i == -1: i = self.tabs.currentIndex() - handle = str(self.tabs.tabText(i)) + handle = unicode(self.tabs.tabText(i)) self.clearNewMessage(handle) def convoHasFocus(self, handle): i = self.tabIndices[handle] @@ -151,19 +151,19 @@ class PesterTabWindow(QtWidgets.QFrame): self.tabs.setTabIcon(tabi, c.icon()) currenttabi = self.tabs.currentIndex() if currenttabi >= 0: - currentHandle = str(self.tabs.tabText(self.tabs.currentIndex())) + currentHandle = 
unicode(self.tabs.tabText(self.tabs.currentIndex())) self.setWindowIcon(self.convos[currentHandle].icon()) self.defaultTabTextColor = self.getTabTextColor() @QtCore.pyqtSlot(int) def tabClose(self, i): - handle = str(self.tabs.tabText(i)) + handle = unicode(self.tabs.tabText(i)) self.mainwindow.waitingMessages.messageAnswered(handle) convo = self.convos[handle] del self.convos[handle] del self.tabIndices[handle] self.tabs.removeTab(i) - for (h, j) in self.tabIndices.items(): + for (h, j) in self.tabIndices.iteritems(): if j > i: self.tabIndices[h] = j-1 self.layout.removeWidget(convo) @@ -173,7 +173,7 @@ class PesterTabWindow(QtWidgets.QFrame): return if self.currentConvo == convo: currenti = self.tabs.currentIndex() - currenth = str(self.tabs.tabText(currenti)) + currenth = unicode(self.tabs.tabText(currenti)) self.currentConvo = self.convos[currenth] self.currentConvo.raiseChat() @@ -184,7 +184,7 @@ class PesterTabWindow(QtWidgets.QFrame): if self.changedTab: self.changedTab = False return - handle = str(self.tabs.tabText(i)) + handle = unicode(self.tabs.tabText(i)) convo = self.convos[handle] if self.currentConvo: self.layout.removeWidget(self.currentConvo) @@ -219,7 +219,7 @@ class PesterMovie(QtGui.QMovie): if text.mainwindow.config.animations(): movie = self url = text.urls[movie].toString() - html = str(text.toHtml()) + html = unicode(text.toHtml()) if html.find(url) != -1: if text.hasTabs: i = text.tabobject.tabIndices[text.parent().title()] @@ -265,13 +265,13 @@ class PesterText(QtWidgets.QTextEdit): def animateChanged(self, animate): if animate: for m in self.urls: - html = str(self.toHtml()) + html = unicode(self.toHtml()) if html.find(self.urls[m].toString()) != -1: if m.frameCount() > 1: m.start() else: for m in self.urls: - html = str(self.toHtml()) + html = unicode(self.toHtml()) if html.find(self.urls[m].toString()) != -1: if m.frameCount() > 1: m.stop() @@ -280,7 +280,7 @@ class PesterText(QtWidgets.QTextEdit): def textReady(self, ready): 
self.textSelected = ready def initTheme(self, theme): - if "convo/scrollbar" in theme: + if theme.has_key("convo/scrollbar"): self.setStyleSheet("QTextEdit { %s } QScrollBar:vertical { %s } QScrollBar::handle:vertical { %s } QScrollBar::add-line:vertical { %s } QScrollBar::sub-line:vertical { %s } QScrollBar:up-arrow:vertical { %s } QScrollBar:down-arrow:vertical { %s }" % (theme["convo/textarea/style"], theme["convo/scrollbar/style"], theme["convo/scrollbar/handle"], theme["convo/scrollbar/downarrow"], theme["convo/scrollbar/uparrow"], theme["convo/scrollbar/uarrowstyle"], theme["convo/scrollbar/darrowstyle"] )) else: self.setStyleSheet("QTextEdit { %s }" % (theme["convo/textarea/style"])) @@ -393,7 +393,7 @@ class PesterText(QtWidgets.QTextEdit): if url[0] == "#" and url != "#pesterchum": self.parent().mainwindow.showMemos(url[1:]) elif url[0] == "@": - handle = str(url[1:]) + handle = unicode(url[1:]) self.parent().mainwindow.newConversation(handle) else: if event.modifiers() == QtCore.Qt.ControlModifier: @@ -435,12 +435,12 @@ class PesterText(QtWidgets.QTextEdit): layout.addWidget(cancelbutton) self.sending.setLayout(layout) self.sending.show() - params = urllib.parse.urlencode({'quote': logdata, 'do': "add"}) + params = urllib.urlencode({'quote': logdata, 'do': "add"}) headers = {"Content-type": "application/x-www-form-urlencoded", "Accept": "text/plain"} try: pass - hconn = http.client.HTTPConnection('qdb.pesterchum.net', 80, + hconn = httplib.HTTPConnection('qdb.pesterchum.net', 80, timeout=15) hconn.request("POST", "/index.php", params, headers) response = hconn.getresponse() @@ -449,7 +449,7 @@ class PesterText(QtWidgets.QTextEdit): else: self.sending.sendinglabel.setText("F41L3D: %s %s" % (response.status, response.reason)) hconn.close() - except Exception as e: + except Exception, e: self.sending.sendinglabel.setText("F41L3D: %s" % (e)) del self.sending @@ -465,7 +465,7 @@ class PesterInput(QtWidgets.QLineEdit): QtWidgets.QLineEdit.focusInEvent(self, 
event) def keyPressEvent(self, event): if event.key() == QtCore.Qt.Key_Up: - text = str(self.text()) + text = unicode(self.text()) next = self.parent().history.next(text) if next is not None: self.setText(next) @@ -596,7 +596,7 @@ class PesterConvo(QtWidgets.QFrame): def updateColor(self, color): self.chum.color = color def addMessage(self, msg, me=True): - if type(msg) is str: + if type(msg) in [str, unicode]: lexmsg = lexMessage(msg) else: lexmsg = msg @@ -696,7 +696,7 @@ class PesterConvo(QtWidgets.QFrame): @QtCore.pyqtSlot() def sentMessage(self): - text = str(self.textInput.text()) + text = unicode(self.textInput.text()) if text == "" or text[0:11] == "PESTERCHUM:": return oocDetected = oocre.match(text.strip()) diff --git a/dataobjs.py b/dataobjs.py index e365a46..2f86f8f 100644 --- a/dataobjs.py +++ b/dataobjs.py @@ -111,7 +111,7 @@ class pesterQuirks(object): newlist = [] for (i, o) in enumerate(lexed): - if type(o) not in [str]: + if type(o) not in [str, unicode]: if i == 0: string = " " for p in prefix: @@ -135,7 +135,7 @@ class pesterQuirks(object): final = [] for n in newlist: - if type(n) in [str]: + if type(n) in [str, unicode]: final.extend(lexMessage(n)) else: final.append(n) @@ -191,9 +191,9 @@ class PesterProfile(object): def plaindict(self): return (self.handle, {"handle": self.handle, "mood": self.mood.name(), - "color": str(self.color.name()), - "group": str(self.group), - "notes": str(self.notes)}) + "color": unicode(self.color.name()), + "group": unicode(self.group), + "notes": unicode(self.notes)}) def blocked(self, config): return self.handle in config.getBlocklist() @@ -238,7 +238,7 @@ class PesterProfile(object): (opchum.colorhtml(), opinit, self.colorhtml(), ", ".join(initials)) else: return "%s banned %s from responding to memo: [%s]." 
% \ - (opchum.colorhtml(), opinit, self.colorhtml(), ", ".join(initials), str(reason)) + (opchum.colorhtml(), opinit, self.colorhtml(), ", ".join(initials), unicode(reason)) else: initials = timeGrammar.pcf+self.initials()+timeGrammar.number if opchum.handle == reason: @@ -246,7 +246,7 @@ class PesterProfile(object): (opchum.colorhtml(), opinit, self.colorhtml(), initials) else: return "%s banned %s from responding to memo: [%s]." % \ - (opchum.colorhtml(), opinit, self.colorhtml(), initials, str(reason)) + (opchum.colorhtml(), opinit, self.colorhtml(), initials, unicode(reason)) def memopermabanmsg(self, opchum, opgrammar, syscolor, timeGrammar): initials = timeGrammar.pcf+self.initials()+timeGrammar.number opinit = opgrammar.pcf+opchum.initials()+opgrammar.number diff --git a/generic.py b/generic.py index 27efeb6..35fdb2a 100644 --- a/generic.py +++ b/generic.py @@ -8,8 +8,6 @@ class mysteryTime(timedelta): return (type(other) is mysteryTime) def __neq__(self, other): return (type(other) is not mysteryTime) - def __hash__(self): - return 0 class CaseInsensitiveDict(dict): def __setitem__(self, key, value): @@ -19,7 +17,7 @@ class CaseInsensitiveDict(dict): def __contains__(self, key): return super(CaseInsensitiveDict, self).__contains__(key.lower()) def has_key(self, key): - return key.lower() in super(CaseInsensitiveDict, self) + return super(CaseInsensitiveDict, self).has_key(key.lower()) def __delitem__(self, key): super(CaseInsensitiveDict, self).__delitem__(key.lower()) @@ -30,7 +28,7 @@ class PesterList(list): class PesterIcon(QtGui.QIcon): def __init__(self, *x): QtGui.QIcon.__init__(self, x[0]) - if type(x[0]) in [str]: + if type(x[0]) in [str, unicode]: self.icon_pixmap = QtGui.QPixmap(x[0]) else: self.icon_pixmap = None @@ -98,8 +96,8 @@ class MultiTextDialog(QtWidgets.QDialog): r = self.exec_() if r == QtWidgets.QDialog.Accepted: retval = {} - for (name, widget) in self.inputs.items(): - retval[name] = str(widget.text()) + for (name, widget) in 
self.inputs.iteritems(): + retval[name] = unicode(widget.text()) return retval else: return None @@ -127,7 +125,7 @@ class MovingWindow(QtWidgets.QFrame): class NoneSound(object): def play(self): pass - def setVolume(self, v): pass + def set_volume(self, v): pass class WMButton(QtWidgets.QPushButton): def __init__(self, icon, parent=None): diff --git a/irc.py b/irc.py index a9e2330..a4e8a2c 100644 --- a/irc.py +++ b/irc.py @@ -38,7 +38,7 @@ class PesterIRC(QtCore.QThread): def run(self): try: self.IRCConnect() - except socket.error as se: + except socket.error, se: self.stopIRC = se return while 1: @@ -46,12 +46,12 @@ class PesterIRC(QtCore.QThread): try: logging.debug("updateIRC()") res = self.updateIRC() - except socket.timeout as se: + except socket.timeout, se: logging.debug("timeout in thread %s" % (self)) self.cli.close() self.stopIRC = se return - except socket.error as se: + except socket.error, se: if self.registeredIRC: self.stopIRC = None else: @@ -73,13 +73,13 @@ class PesterIRC(QtCore.QThread): @QtCore.pyqtSlot() def updateIRC(self): try: - res = next(self.conn) - except socket.timeout as se: + res = self.conn.next() + except socket.timeout, se: if self.registeredIRC: return True else: raise se - except socket.error as se: + except socket.error, se: raise se except StopIteration: self.conn = self.cli.conn() @@ -99,16 +99,16 @@ class PesterIRC(QtCore.QThread): self.cli.command_handler.getMood(*chums) @QtCore.pyqtSlot('QString', 'QString') def sendNotice(self, text, handle): - h = str(handle) - t = str(text) + h = unicode(handle) + t = unicode(text) try: helpers.notice(self.cli, h, t) except socket.error: self.setConnectionBroken() @QtCore.pyqtSlot('QString', 'QString') def sendMessage(self, text, handle): - h = str(handle) - textl = [str(text)] + h = unicode(handle) + textl = [unicode(text)] def splittext(l): if len(l[0]) > 450: space = l[0].rfind(" ", 0,430) @@ -153,7 +153,7 @@ class PesterIRC(QtCore.QThread): self.setConnectionBroken() 
@QtCore.pyqtSlot('QString', bool) def startConvo(self, handle, initiated): - h = str(handle) + h = unicode(handle) try: if initiated: helpers.msg(self.cli, h, "PESTERCHUM:BEGIN") @@ -162,7 +162,7 @@ class PesterIRC(QtCore.QThread): self.setConnectionBroken() @QtCore.pyqtSlot('QString') def endConvo(self, handle): - h = str(handle) + h = unicode(handle) try: helpers.msg(self.cli, h, "PESTERCHUM:CEASE") except socket.error: @@ -197,21 +197,21 @@ class PesterIRC(QtCore.QThread): self.setConnectionBroken() @QtCore.pyqtSlot('QString') def blockedChum(self, handle): - h = str(handle) + h = unicode(handle) try: helpers.msg(self.cli, h, "PESTERCHUM:BLOCK") except socket.error: self.setConnectionBroken() @QtCore.pyqtSlot('QString') def unblockedChum(self, handle): - h = str(handle) + h = unicode(handle) try: helpers.msg(self.cli, h, "PESTERCHUM:UNBLOCK") except socket.error: self.setConnectionBroken() @QtCore.pyqtSlot('QString') def requestNames(self, channel): - c = str(channel) + c = unicode(channel) try: helpers.names(self.cli, c) except socket.error: @@ -224,7 +224,7 @@ class PesterIRC(QtCore.QThread): self.setConnectionBroken() @QtCore.pyqtSlot('QString') def joinChannel(self, channel): - c = str(channel) + c = unicode(channel) try: helpers.join(self.cli, c) helpers.mode(self.cli, c, "", None) @@ -232,7 +232,7 @@ class PesterIRC(QtCore.QThread): self.setConnectionBroken() @QtCore.pyqtSlot('QString') def leftChannel(self, channel): - c = str(channel) + c = unicode(channel) try: helpers.part(self.cli, c) self.cli.command_handler.joined = False @@ -241,13 +241,13 @@ class PesterIRC(QtCore.QThread): @QtCore.pyqtSlot('QString', 'QString') def kickUser(self, handle, channel): l = handle.split(":") - c = str(channel) - h = str(l[0]) + c = unicode(channel) + h = unicode(l[0]) if len(l) > 1: - reason = str(l[1]) + reason = unicode(l[1]) if len(l) > 2: for x in l[2:]: - reason += str(":") + str(x) + reason += unicode(":") + unicode(x) else: reason = "" try: @@ -256,9 +256,9 @@ 
class PesterIRC(QtCore.QThread): self.setConnectionBroken() @QtCore.pyqtSlot('QString', 'QString', 'QString') def setChannelMode(self, channel, mode, command): - c = str(channel) - m = str(mode) - cmd = str(command) + c = unicode(channel) + m = unicode(mode) + cmd = unicode(command) if cmd == "": cmd = None try: @@ -267,15 +267,15 @@ class PesterIRC(QtCore.QThread): self.setConnectionBroken() @QtCore.pyqtSlot('QString') def channelNames(self, channel): - c = str(channel) + c = unicode(channel) try: helpers.names(self.cli, c) except socket.error: self.setConnectionBroken() @QtCore.pyqtSlot('QString', 'QString') def inviteChum(self, handle, channel): - h = str(handle) - c = str(channel) + h = unicode(handle) + c = unicode(channel) try: helpers.invite(self.cli, h, c) except socket.error: @@ -300,8 +300,8 @@ class PesterIRC(QtCore.QThread): @QtCore.pyqtSlot('QString', 'QString') def killSomeQuirks(self, channel, handle): - c = str(channel) - h = str(handle) + c = unicode(channel) + h = unicode(handle) try: helpers.ctcp(self.cli, c, "NOQUIRKS", h) except socket.error: @@ -333,8 +333,6 @@ class PesterHandler(DefaultCommandHandler): msg = msg.decode('utf-8') except UnicodeDecodeError: msg = msg.decode('iso-8859-1', 'ignore') - nick = nick.decode('utf-8') - chan = chan.decode('utf-8') handle = nick[0:nick.find("!")] logging.info("---> recv \"NOTICE %s :%s\"" % (handle, msg)) if handle == "ChanServ" and chan == self.parent.mainwindow.profile().handle and msg[0:2] == "[#": @@ -346,8 +344,6 @@ class PesterHandler(DefaultCommandHandler): msg = msg.decode('utf-8') except UnicodeDecodeError: msg = msg.decode('iso-8859-1', 'ignore') - nick = nick.decode('utf-8') - chan = chan.decode('utf-8') # display msg, do other stuff if len(msg) == 0: return @@ -410,11 +406,8 @@ class PesterHandler(DefaultCommandHandler): def nicknameinuse(self, server, cmd, nick, msg): newnick = "pesterClient%d" % (random.randint(100,999)) helpers.nick(self.client, newnick) - nick = nick.decode('utf-8') 
self.parent.nickCollision.emit(nick, newnick) def quit(self, nick, reason): - nick = nick.decode('utf-8') - reason = reason.decode('utf-8') handle = nick[0:nick.find("!")] logging.info("---> recv \"QUIT %s: %s\"" % (handle, reason)) if handle == self.parent.mainwindow.randhandler.randNick: @@ -427,22 +420,17 @@ class PesterHandler(DefaultCommandHandler): self.parent.userPresentUpdate.emit(handle, "", "quit") self.parent.moodUpdated.emit(handle, Mood("offline")) def kick(self, opnick, channel, handle, reason): - opnick = opnick.decode('utf-8') op = opnick[0:opnick.find("!")] self.parent.userPresentUpdate.emit(handle, channel, "kick:%s:%s" % (op, reason)) # ok i shouldnt be overloading that but am lazy def part(self, nick, channel, reason="nanchos"): - nick = nick.decode('utf-8') - channel = channel.decode('utf-8') handle = nick[0:nick.find("!")] logging.info("---> recv \"PART %s: %s\"" % (handle, channel)) self.parent.userPresentUpdate.emit(handle, channel, "left") if channel == "#pesterchum": self.parent.moodUpdated.emit(handle, Mood("offline")) def join(self, nick, channel): - nick = nick.decode('utf-8') handle = nick[0:nick.find("!")] - channel = channel.decode('utf-8') logging.info("---> recv \"JOIN %s: %s\"" % (handle, channel)) self.parent.userPresentUpdate.emit(handle, channel, "join") if channel == "#pesterchum": @@ -450,9 +438,6 @@ class PesterHandler(DefaultCommandHandler): self.parent.mainwindow.randhandler.setRunning(True) self.parent.moodUpdated.emit(handle, Mood("chummy")) def mode(self, op, channel, mode, *handles): - op = op.decode('utf-8') - channel = channel.decode('utf-8') - mode = mode.decode('utf-8') if len(handles) <= 0: handles = [""] opnick = op[0:op.find("!")] if op == channel or channel == self.parent.mainwindow.profile().handle: @@ -482,8 +467,6 @@ class PesterHandler(DefaultCommandHandler): except IndexError: self.parent.userPresentUpdate.emit("", channel, m+":%s" % (op)) def nick(self, oldnick, newnick): - oldnick = 
oldnick.decode('utf-8') - newnick = newnick.decode('utf-8') oldhandle = oldnick[0:oldnick.find("!")] if oldhandle == self.mainwindow.profile().handle: self.parent.myHandleChanged.emit(newnick) @@ -497,8 +480,6 @@ class PesterHandler(DefaultCommandHandler): elif newnick == self.parent.mainwindow.randhandler.randNick: self.parent.mainwindow.randhandler.setRunning(True) def namreply(self, server, nick, op, channel, names): - channel = channel.decode('utf-8') - names = names.decode('utf-8') namelist = names.split(" ") logging.info("---> recv \"NAMES %s: %d names\"" % (channel, len(namelist))) if not hasattr(self, 'channelnames'): @@ -507,7 +488,6 @@ class PesterHandler(DefaultCommandHandler): self.channelnames[channel] = [] self.channelnames[channel].extend(namelist) def endofnames(self, server, nick, channel, msg): - channel = channel.decode('utf-8') namelist = self.channelnames[channel] pl = PesterList(namelist) del self.channelnames[channel] @@ -525,11 +505,10 @@ class PesterHandler(DefaultCommandHandler): def liststart(self, server, handle, *info): self.channel_list = [] - info = [i.decode('utf-8') for i in info] + info = list(info) self.channel_field = info.index("Channel") # dunno if this is protocol logging.info("---> recv \"CHANNELS: %s " % (self.channel_field)) def list(self, server, handle, *info): - info = [i.decode('utf-8') for i in info] channel = info[self.channel_field] usercount = info[1] if channel not in self.channel_list and channel != "#pesterchum": @@ -542,29 +521,21 @@ class PesterHandler(DefaultCommandHandler): self.channel_list = [] def umodeis(self, server, handle, modes): - modes = modes.decode('utf-8') self.parent.mainwindow.modes = modes def invite(self, sender, you, channel): - sender = sender.decode('utf-8') handle = sender.split('!')[0] self.parent.inviteReceived.emit(handle, channel) def inviteonlychan(self, server, handle, channel, msg): - channel = channel.decode('utf-8') self.parent.chanInviteOnly.emit(channel) def channelmodeis(self, 
server, handle, channel, modes): - modes = modes.decode('utf-8') - channel = channel.decode('utf-8') self.parent.modesUpdated.emit(channel, modes) def cannotsendtochan(self, server, handle, channel, msg): - msg = msg.decode('utf-8') - channel = channel.decode('utf-8') self.parent.cannotSendToChan.emit(channel, msg) def toomanypeeps(self, *stuff): self.parent.tooManyPeeps.emit() def ping(self, prefix, server): self.parent.mainwindow.lastping = int(time()) - server = server.decode('utf-8') self.client.send('PONG', server) def getMood(self, *chums): diff --git a/libs/feedparser.py b/libs/feedparser.py index 540f6e5..bb802df 100755 --- a/libs/feedparser.py +++ b/libs/feedparser.py @@ -1,19 +1,18 @@ +#!/usr/bin/env python """Universal feed parser Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds -Visit https://code.google.com/p/feedparser/ for the latest version -Visit http://packages.python.org/feedparser/ for the latest documentation +Visit http://feedparser.org/ for the latest version +Visit http://feedparser.org/docs/ for the latest documentation -Required: Python 2.4 or later -Recommended: iconv_codec +Required: Python 2.1 or later +Recommended: Python 2.3 or later +Recommended: CJKCodecs and iconv_codec """ -__version__ = "5.1.3" -__license__ = """ -Copyright (c) 2010-2012 Kurt McKee -Copyright (c) 2002-2008 Mark Pilgrim -All rights reserved. +__version__ = "4.1"# + "$Revision: 1.92 $"[11:15] + "-cvs" +__license__ = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -40,17 +39,13 @@ __contributors__ = ["Jason Diamond ", "John Beimler ", "Fazal Majid ", "Aaron Swartz ", - "Kevin Marks ", - "Sam Ruby ", - "Ade Oshineye ", - "Martin Pool ", - "Kurt McKee ", - "Bernd Schlapsi ",] + "Kevin Marks "] +_debug = 0 # HTTP "User-Agent" header to send to servers when downloading feeds. 
# If you are embedding feedparser in a larger application, you should # change this to your application name and URL. -USER_AGENT = "UniversalFeedParser/%s +https://code.google.com/p/feedparser/" % __version__ +USER_AGENT = "UniversalFeedParser/%s +http://feedparser.org/" % __version__ # HTTP "Accept" header to send to servers when downloading feeds. If you don't # want to send an Accept header, set this to None. @@ -70,221 +65,69 @@ TIDY_MARKUP = 0 # if TIDY_MARKUP = 1 PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"] -# If you want feedparser to automatically resolve all relative URIs, set this -# to 1. -RESOLVE_RELATIVE_URIS = 1 - -# If you want feedparser to automatically sanitize all potentially unsafe -# HTML content, set this to 1. -SANITIZE_HTML = 1 - -# If you want feedparser to automatically parse microformat content embedded -# in entry contents, set this to 1 -PARSE_MICROFORMATS = 1 - -# ---------- Python 3 modules (make it work if possible) ---------- -try: - import rfc822 -except ImportError: - from email import _parseaddr as rfc822 - -try: - # Python 3.1 introduces bytes.maketrans and simultaneously - # deprecates string.maketrans; use bytes.maketrans if possible - _maketrans = bytes.maketrans -except (NameError, AttributeError): - import string - _maketrans = string.maketrans - -# base64 support for Atom feeds that contain embedded binary data -try: - import base64, binascii -except ImportError: - base64 = binascii = None -else: - # Python 3.1 deprecates decodestring in favor of decodebytes - _base64decode = getattr(base64, 'decodebytes', base64.decodestring) - -# _s2bytes: convert a UTF-8 str to bytes if the interpreter is Python 3 -# _l2bytes: convert a list of ints to bytes if the interpreter is Python 3 -try: - if bytes is str: - # In Python 2.5 and below, bytes doesn't exist (NameError) - # In Python 2.6 and above, bytes and str are the same type - raise NameError -except NameError: - # Python 2 - def _s2bytes(s): - return s - def _l2bytes(l): - 
return ''.join(map(chr, l)) -else: - # Python 3 - def _s2bytes(s): - return bytes(s, 'utf8') - def _l2bytes(l): - return bytes(l) - -# If you want feedparser to allow all URL schemes, set this to () -# List culled from Python's urlparse documentation at: -# http://docs.python.org/library/urlparse.html -# as well as from "URI scheme" at Wikipedia: -# https://secure.wikimedia.org/wikipedia/en/wiki/URI_scheme -# Many more will likely need to be added! -ACCEPTABLE_URI_SCHEMES = ( - 'file', 'ftp', 'gopher', 'h323', 'hdl', 'http', 'https', 'imap', 'magnet', - 'mailto', 'mms', 'news', 'nntp', 'prospero', 'rsync', 'rtsp', 'rtspu', - 'sftp', 'shttp', 'sip', 'sips', 'snews', 'svn', 'svn+ssh', 'telnet', - 'wais', - # Additional common-but-unofficial schemes - 'aim', 'callto', 'cvs', 'facetime', 'feed', 'git', 'gtalk', 'irc', 'ircs', - 'irc6', 'itms', 'mms', 'msnim', 'skype', 'ssh', 'smb', 'svn', 'ymsg', -) -#ACCEPTABLE_URI_SCHEMES = () - # ---------- required modules (should come with any Python distribution) ---------- -import cgi -import codecs -import copy -import datetime -import re -import struct -import time -import types -import urllib.request, urllib.parse, urllib.error -import urllib.request, urllib.error, urllib.parse -import urllib.parse -import warnings - -from html.entities import name2codepoint, codepoint2name, entitydefs - +import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi, urllib, urllib2 try: - from io import BytesIO as _StringIO -except ImportError: - try: - from io import StringIO as _StringIO - except ImportError: - from io import StringIO as _StringIO + from cStringIO import StringIO as _StringIO +except: + from StringIO import StringIO as _StringIO # ---------- optional modules (feedparser will work without these, but with reduced functionality) ---------- # gzip is included with most Python distributions, but may not be available if you compiled your own try: import gzip -except ImportError: +except: gzip = None try: import zlib -except 
ImportError: +except: zlib = None # If a real XML parser is available, feedparser will attempt to use it. feedparser has -# been tested with the built-in SAX parser and libxml2. On platforms where the +# been tested with the built-in SAX parser, PyXML, and libxml2. On platforms where the # Python distribution does not come with an XML parser (such as Mac OS X 10.2 and some # versions of FreeBSD), feedparser will quietly fall back on regex-based parsing. try: import xml.sax + xml.sax.make_parser(PREFERRED_XML_PARSERS) # test for valid parsers from xml.sax.saxutils import escape as _xmlescape -except ImportError: + _XML_AVAILABLE = 1 +except: _XML_AVAILABLE = 0 - def _xmlescape(data,entities={}): + def _xmlescape(data): data = data.replace('&', '&') data = data.replace('>', '>') data = data.replace('<', '<') - for char, entity in entities: - data = data.replace(char, entity) return data -else: - try: - xml.sax.make_parser(PREFERRED_XML_PARSERS) # test for valid parsers - except xml.sax.SAXReaderNotAvailable: - _XML_AVAILABLE = 0 - else: - _XML_AVAILABLE = 1 -# sgmllib is not available by default in Python 3; if the end user doesn't have -# it available then we'll lose illformed XML parsing, content santizing, and -# microformat support (at least while feedparser depends on BeautifulSoup). +# base64 support for Atom feeds that contain embedded binary data try: - import sgmllib -except ImportError: - # This is probably Python 3, which doesn't include sgmllib anymore - _SGML_AVAILABLE = 0 + import base64, binascii +except: + base64 = binascii = None - # Mock sgmllib enough to allow subclassing later on - class sgmllib(object): - class SGMLParser(object): - def goahead(self, i): - pass - def parse_starttag(self, i): - pass -else: - _SGML_AVAILABLE = 1 - - # sgmllib defines a number of module-level regular expressions that are - # insufficient for the XML parsing feedparser needs. 
Rather than modify - # the variables directly in sgmllib, they're defined here using the same - # names, and the compiled code objects of several sgmllib.SGMLParser - # methods are copied into _BaseHTMLProcessor so that they execute in - # feedparser's scope instead of sgmllib's scope. - charref = re.compile('&#(\d+|[xX][0-9a-fA-F]+);') - tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*') - attrfind = re.compile( - r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)[$]?(\s*=\s*' - r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?' - ) - - # Unfortunately, these must be copied over to prevent NameError exceptions - entityref = sgmllib.entityref - incomplete = sgmllib.incomplete - interesting = sgmllib.interesting - shorttag = sgmllib.shorttag - shorttagopen = sgmllib.shorttagopen - starttagopen = sgmllib.starttagopen - - class _EndBracketRegEx: - def __init__(self): - # Overriding the built-in sgmllib.endbracket regex allows the - # parser to find angle brackets embedded in element attributes. - self.endbracket = re.compile('''([^'"<>]|"[^"]*"(?=>|/|\s|\w+=)|'[^']*'(?=>|/|\s|\w+=))*(?=[<>])|.*?(?=[<>])''') - def search(self, target, index=0): - match = self.endbracket.match(target, index) - if match is not None: - # Returning a new object in the calling thread's context - # resolves a thread-safety. - return EndBracketMatch(match) - return None - class EndBracketMatch: - def __init__(self, match): - self.match = match - def start(self, n): - return self.match.end(n) - endbracket = _EndBracketRegEx() - - -# iconv_codec provides support for more character encodings. -# It's available from http://cjkpython.i18n.org/ +# cjkcodecs and iconv_codec provide support for more character encodings. 
+# Both are available from http://cjkpython.i18n.org/ +try: + import cjkcodecs.aliases +except: + pass try: import iconv_codec -except ImportError: +except: pass # chardet library auto-detects character encodings # Download from http://chardet.feedparser.org/ try: import chardet -except ImportError: + if _debug: + import chardet.constants + chardet.constants._debug = 1 +except: chardet = None -# BeautifulSoup is used to extract microformat content from HTML -# feedparser is tested using BeautifulSoup 3.2.0 -# http://www.crummy.com/software/BeautifulSoup/ -try: - import BeautifulSoup -except ImportError: - BeautifulSoup = None - PARSE_MICROFORMATS = False - # ---------- don't touch these ---------- class ThingsNobodyCaresAboutButMe(Exception): pass class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe): pass @@ -292,6 +135,10 @@ class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe): pass class NonXMLContentType(ThingsNobodyCaresAboutButMe): pass class UndeclaredNamespace(Exception): pass +sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*') +sgmllib.special = re.compile('' % (tag, self.strattrs(attrs)), escape=0) + # Note: probably shouldn't simply recreate localname here, but + # our namespace handling isn't actually 100% correct in cases where + # the feed redefines the default namespace (which is actually + # the usual case for inline content, thanks Sam), so here we + # cheat and just reconstruct the element based on localname + # because that compensates for the bugs in our namespace handling. + # This will horribly munge inline content with non-empty qnames, + # but nobody actually does that, so I'm not fixing it. 
+ tag = tag.split(':')[-1] + return self.handle_data('<%s%s>' % (tag, ''.join([' %s="%s"' % t for t in attrs])), escape=0) # match namespaces - if tag.find(':') != -1: + if tag.find(':') <> -1: prefix, suffix = tag.split(':', 1) else: prefix, suffix = '', tag @@ -663,50 +449,37 @@ class _FeedParserMixin: self.intextinput = 0 if (not prefix) and tag not in ('title', 'link', 'description', 'url', 'href', 'width', 'height'): self.inimage = 0 - + # call special handler (if defined) or default handler methodname = '_start_' + prefix + suffix try: method = getattr(self, methodname) return method(attrsD) except AttributeError: - # Since there's no handler or something has gone wrong we explicitly add the element and its attributes - unknown_tag = prefix + suffix - if len(attrsD) == 0: - # No attributes so merge it into the encosing dictionary - return self.push(unknown_tag, 1) - else: - # Has attributes so create it in its own dictionary - context = self._getContext() - context[unknown_tag] = attrsD + return self.push(prefix + suffix, 1) def unknown_endtag(self, tag): + if _debug: sys.stderr.write('end %s\n' % tag) # match namespaces - if tag.find(':') != -1: + if tag.find(':') <> -1: prefix, suffix = tag.split(':', 1) else: prefix, suffix = '', tag prefix = self.namespacemap.get(prefix, prefix) if prefix: prefix = prefix + '_' - if suffix == 'svg' and self.svgOK: - self.svgOK -= 1 # call special handler (if defined) or default handler methodname = '_end_' + prefix + suffix try: - if self.svgOK: - raise AttributeError() method = getattr(self, methodname) method() except AttributeError: self.pop(prefix + suffix) # track inline content - if self.incontent and not self.contentparams.get('type', 'xml').endswith('xml'): + if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'): # element declared itself as escaped markup, but it isn't really - if tag in ('xhtml:div', 'div'): - return # typepad does this 10/2007 
self.contentparams['type'] = 'application/xhtml+xml' if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml': tag = tag.split(':')[-1] @@ -722,12 +495,9 @@ class _FeedParserMixin: if self.langstack: # and (self.langstack[-1] is not None): self.lang = self.langstack[-1] - self.depth -= 1 - def handle_charref(self, ref): # called for each character reference, e.g. for ' ', ref will be '160' - if not self.elementstack: - return + if not self.elementstack: return ref = ref.lower() if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'): text = '&#%s;' % ref @@ -736,33 +506,34 @@ class _FeedParserMixin: c = int(ref[1:], 16) else: c = int(ref) - text = chr(c).encode('utf-8') + text = unichr(c).encode('utf-8') self.elementstack[-1][2].append(text) def handle_entityref(self, ref): # called for each entity reference, e.g. for '©', ref will be 'copy' - if not self.elementstack: - return + if not self.elementstack: return + if _debug: sys.stderr.write('entering handle_entityref with %s\n' % ref) if ref in ('lt', 'gt', 'quot', 'amp', 'apos'): text = '&%s;' % ref - elif ref in self.entities: - text = self.entities[ref] - if text.startswith('&#') and text.endswith(';'): - return self.handle_entityref(text) else: - try: - name2codepoint[ref] - except KeyError: - text = '&%s;' % ref - else: - text = chr(name2codepoint[ref]).encode('utf-8') + # entity resolution graciously donated by Aaron Swartz + def name2cp(k): + import htmlentitydefs + if hasattr(htmlentitydefs, 'name2codepoint'): # requires Python 2.3 + return htmlentitydefs.name2codepoint[k] + k = htmlentitydefs.entitydefs[k] + if k.startswith('&#') and k.endswith(';'): + return int(k[2:-1]) # not in latin-1 + return ord(k) + try: name2cp(ref) + except KeyError: text = '&%s;' % ref + else: text = unichr(name2cp(ref)).encode('utf-8') self.elementstack[-1][2].append(text) def handle_data(self, text, escape=1): # called for each block of plain text, i.e. 
outside of any tag and # not containing any character or entity references - if not self.elementstack: - return + if not self.elementstack: return if escape and self.contentparams.get('type') == 'application/xhtml+xml': text = _xmlescape(text) self.elementstack[-1][2].append(text) @@ -780,46 +551,39 @@ class _FeedParserMixin: def parse_declaration(self, i): # override internal declaration handler to handle CDATA blocks + if _debug: sys.stderr.write('entering parse_declaration\n') if self.rawdata[i:i+9] == '', i) - if k == -1: - # CDATA block began but didn't finish - k = len(self.rawdata) - return k + if k == -1: k = len(self.rawdata) self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0) return k+3 else: k = self.rawdata.find('>', i) - if k >= 0: - return k+1 - else: - # We have an incomplete CDATA block. - return k + return k+1 def mapContentType(self, contentType): contentType = contentType.lower() - if contentType == 'text' or contentType == 'plain': + if contentType == 'text': contentType = 'text/plain' elif contentType == 'html': contentType = 'text/html' elif contentType == 'xhtml': contentType = 'application/xhtml+xml' return contentType - + def trackNamespace(self, prefix, uri): loweruri = uri.lower() - if not self.version: - if (prefix, loweruri) == (None, 'http://my.netscape.com/rdf/simple/0.9/'): - self.version = 'rss090' - elif loweruri == 'http://purl.org/rss/1.0/': - self.version = 'rss10' - elif loweruri == 'http://www.w3.org/2005/atom': - self.version = 'atom10' - if loweruri.find('backend.userland.com/rss') != -1: + if (prefix, loweruri) == (None, 'http://my.netscape.com/rdf/simple/0.9/') and not self.version: + self.version = 'rss090' + if loweruri == 'http://purl.org/rss/1.0/' and not self.version: + self.version = 'rss10' + if loweruri == 'http://www.w3.org/2005/atom' and not self.version: + self.version = 'atom10' + if loweruri.find('backend.userland.com/rss') <> -1: # match any backend.userland.com namespace uri = 
'http://backend.userland.com/rss' loweruri = uri - if loweruri in self._matchnamespaces: + if self._matchnamespaces.has_key(loweruri): self.namespacemap[prefix] = self._matchnamespaces[loweruri] self.namespacesInUse[self._matchnamespaces[loweruri]] = uri else: @@ -827,83 +591,40 @@ class _FeedParserMixin: def resolveURI(self, uri): return _urljoin(self.baseuri or '', uri) - + def decodeEntities(self, element, data): return data - def strattrs(self, attrs): - return ''.join([' %s="%s"' % (t[0],_xmlescape(t[1],{'"':'"'})) for t in attrs]) - def push(self, element, expectingText): self.elementstack.append([element, expectingText, []]) def pop(self, element, stripWhitespace=1): - if not self.elementstack: - return - if self.elementstack[-1][0] != element: - return - + if not self.elementstack: return + if self.elementstack[-1][0] != element: return + element, expectingText, pieces = self.elementstack.pop() - - if self.version == 'atom10' and self.contentparams.get('type', 'text') == 'application/xhtml+xml': - # remove enclosing child element, but only if it is a
and - # only if all the remaining content is nested underneath it. - # This means that the divs would be retained in the following: - #
foo
bar
- while pieces and len(pieces)>1 and not pieces[-1].strip(): - del pieces[-1] - while pieces and len(pieces)>1 and not pieces[0].strip(): - del pieces[0] - if pieces and (pieces[0] == '
' or pieces[0].startswith('
': - depth = 0 - for piece in pieces[:-1]: - if piece.startswith(''): - depth += 1 - else: - pieces = pieces[1:-1] - - # Ensure each piece is a str for Python 3 - for (i, v) in enumerate(pieces): - if not isinstance(v, str): - pieces[i] = v.decode('utf-8') - output = ''.join(pieces) if stripWhitespace: output = output.strip() - if not expectingText: - return output + if not expectingText: return output # decode base64 content if base64 and self.contentparams.get('base64', 0): try: - output = _base64decode(output) + output = base64.decodestring(output) except binascii.Error: pass except binascii.Incomplete: pass - except TypeError: - # In Python 3, base64 takes and outputs bytes, not str - # This may not be the most correct way to accomplish this - output = _base64decode(output.encode('utf-8')).decode('utf-8') - + # resolve relative URIs if (element in self.can_be_relative_uri) and output: output = self.resolveURI(output) - + # decode entities within embedded markup if not self.contentparams.get('base64', 0): output = self.decodeEntities(element, output) - # some feed formats require consumers to guess - # whether the content is html or plain text - if not self.version.startswith('atom') and self.contentparams.get('type') == 'text/plain': - if self.lookslikehtml(output): - self.contentparams['type'] = 'text/html' - # remove temporary cruft from contentparams try: del self.contentparams['mode'] @@ -914,55 +635,26 @@ class _FeedParserMixin: except KeyError: pass - is_htmlish = self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types # resolve relative URIs within embedded markup - if is_htmlish and RESOLVE_RELATIVE_URIS: + if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types: if element in self.can_contain_relative_uris: - output = _resolveRelativeURIs(output, self.baseuri, self.encoding, self.contentparams.get('type', 'text/html')) - - # parse microformats - # (must do this before sanitizing because some 
microformats - # rely on elements that we sanitize) - if PARSE_MICROFORMATS and is_htmlish and element in ['content', 'description', 'summary']: - mfresults = _parseMicroformats(output, self.baseuri, self.encoding) - if mfresults: - for tag in mfresults.get('tags', []): - self._addTag(tag['term'], tag['scheme'], tag['label']) - for enclosure in mfresults.get('enclosures', []): - self._start_enclosure(enclosure) - for xfn in mfresults.get('xfn', []): - self._addXFN(xfn['relationships'], xfn['href'], xfn['name']) - vcard = mfresults.get('vcard') - if vcard: - self._getContext()['vcard'] = vcard - + output = _resolveRelativeURIs(output, self.baseuri, self.encoding) + # sanitize embedded markup - if is_htmlish and SANITIZE_HTML: + if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types: if element in self.can_contain_dangerous_markup: - output = _sanitizeHTML(output, self.encoding, self.contentparams.get('type', 'text/html')) + output = _sanitizeHTML(output, self.encoding) - if self.encoding and not isinstance(output, str): - output = output.decode(self.encoding, 'ignore') - - # address common error where people take data that is already - # utf-8, presume that it is iso-8859-1, and re-encode it. 
- if self.encoding in ('utf-8', 'utf-8_INVALID_PYTHON_3') and isinstance(output, str): + if self.encoding and type(output) != type(u''): try: - output = output.encode('iso-8859-1').decode('utf-8') - except (UnicodeEncodeError, UnicodeDecodeError): + output = unicode(output, self.encoding) + except: pass - # map win-1252 extensions to the proper code points - if isinstance(output, str): - output = output.translate(_cp1252) - # categories/tags/keywords/whatever are handled in _end_category if element == 'category': return output - - if element == 'title' and -1 < self.title_depth <= self.depth: - return output - + # store output in appropriate place(s) if self.inentry and not self.insource: if element == 'content': @@ -971,34 +663,23 @@ class _FeedParserMixin: contentparams['value'] = output self.entries[-1][element].append(contentparams) elif element == 'link': - if not self.inimage: - # query variables in urls in link elements are improperly - # converted from `?a=1&b=2` to `?a=1&b;=2` as if they're - # unhandled character references. fix this special case. 
- output = re.sub("&([A-Za-z0-9_]+);", "&\g<1>", output) - self.entries[-1][element] = output - if output: - self.entries[-1]['links'][-1]['href'] = output + self.entries[-1][element] = output + if output: + self.entries[-1]['links'][-1]['href'] = output else: if element == 'description': element = 'summary' - old_value_depth = self.property_depth_map.setdefault(self.entries[-1], {}).get(element) - if old_value_depth is None or self.depth <= old_value_depth: - self.property_depth_map[self.entries[-1]][element] = self.depth - self.entries[-1][element] = output + self.entries[-1][element] = output if self.incontent: contentparams = copy.deepcopy(self.contentparams) contentparams['value'] = output self.entries[-1][element + '_detail'] = contentparams - elif (self.infeed or self.insource):# and (not self.intextinput) and (not self.inimage): + elif (self.infeed or self.insource) and (not self.intextinput) and (not self.inimage): context = self._getContext() if element == 'description': element = 'subtitle' context[element] = output if element == 'link': - # fix query variables; see above for the explanation - output = re.sub("&([A-Za-z0-9_]+);", "&\g<1>", output) - context[element] = output context['links'][-1]['href'] = output elif self.incontent: contentparams = copy.deepcopy(self.contentparams) @@ -1008,8 +689,6 @@ class _FeedParserMixin: def pushContent(self, tag, attrsD, defaultContentType, expectingText): self.incontent += 1 - if self.lang: - self.lang=self.lang.replace('_','-') self.contentparams = FeedParserDict({ 'type': self.mapContentType(attrsD.get('type', defaultContentType)), 'language': self.lang, @@ -1022,36 +701,16 @@ class _FeedParserMixin: self.incontent -= 1 self.contentparams.clear() return value - - # a number of elements in a number of RSS variants are nominally plain - # text, but this is routinely ignored. This is an attempt to detect - # the most common cases. 
As false positives often result in silent - # data loss, this function errs on the conservative side. - @staticmethod - def lookslikehtml(s): - # must have a close tag or an entity reference to qualify - if not (re.search(r'',s) or re.search("&#?\w+;",s)): - return - - # all tags must be in a restricted subset of valid HTML tags - if [t for t in re.findall(r' -1: prefix = name[:colonpos] suffix = name[colonpos+1:] prefix = self.namespacemap.get(prefix, prefix) name = prefix + ':' + suffix return name - + def _getAttribute(self, attrsD, name): return attrsD.get(self._mapToStandardPrefix(name)) @@ -1079,23 +738,17 @@ class _FeedParserMixin: pass attrsD['href'] = href return attrsD - - def _save(self, key, value, overwrite=False): + + def _save(self, key, value): context = self._getContext() - if overwrite: - context[key] = value - else: - context.setdefault(key, value) + context.setdefault(key, value) def _start_rss(self, attrsD): versionmap = {'0.91': 'rss091u', '0.92': 'rss092', '0.93': 'rss093', '0.94': 'rss094'} - #If we're here then this is an RSS feed. - #If we don't have a version or have a version that starts with something - #other than RSS then there's been a mistake. Correct it. 
- if not self.version or not self.version.startswith('rss'): + if not self.version: attr_version = attrsD.get('version', '') version = versionmap.get(attr_version) if version: @@ -1104,21 +757,25 @@ class _FeedParserMixin: self.version = 'rss20' else: self.version = 'rss' + + def _start_dlhottitles(self, attrsD): + self.version = 'hotrss' def _start_channel(self, attrsD): self.infeed = 1 self._cdf_common(attrsD) + _start_feedinfo = _start_channel def _cdf_common(self, attrsD): - if 'lastmod' in attrsD: + if attrsD.has_key('lastmod'): self._start_modified({}) self.elementstack[-1][-1] = attrsD['lastmod'] self._end_modified() - if 'href' in attrsD: + if attrsD.has_key('href'): self._start_link({}) self.elementstack[-1][-1] = attrsD['href'] self._end_link() - + def _start_feed(self, attrsD): self.infeed = 1 versionmap = {'0.1': 'atom01', @@ -1135,27 +792,24 @@ class _FeedParserMixin: def _end_channel(self): self.infeed = 0 _end_feed = _end_channel - + def _start_image(self, attrsD): - context = self._getContext() - if not self.inentry: - context.setdefault('image', FeedParserDict()) self.inimage = 1 - self.title_depth = -1 self.push('image', 0) - + context = self._getContext() + context.setdefault('image', FeedParserDict()) + def _end_image(self): self.pop('image') self.inimage = 0 def _start_textinput(self, attrsD): + self.intextinput = 1 + self.push('textinput', 0) context = self._getContext() context.setdefault('textinput', FeedParserDict()) - self.intextinput = 1 - self.title_depth = -1 - self.push('textinput', 0) _start_textInput = _start_textinput - + def _end_textinput(self): self.pop('textinput') self.intextinput = 0 @@ -1164,10 +818,6 @@ class _FeedParserMixin: def _start_author(self, attrsD): self.inauthor = 1 self.push('author', 1) - # Append a new FeedParserDict when expecting an author - context = self._getContext() - context.setdefault('authors', []) - context['authors'].append(FeedParserDict()) _start_managingeditor = _start_author _start_dc_author = 
_start_author _start_dc_creator = _start_author @@ -1227,7 +877,7 @@ class _FeedParserMixin: self._save_contributor('name', value) elif self.intextinput: context = self._getContext() - context['name'] = value + context['textinput']['name'] = value _end_itunes_name = _end_name def _start_width(self, attrsD): @@ -1237,11 +887,11 @@ class _FeedParserMixin: value = self.pop('width') try: value = int(value) - except ValueError: + except: value = 0 if self.inimage: context = self._getContext() - context['width'] = value + context['image']['width'] = value def _start_height(self, attrsD): self.push('height', 0) @@ -1250,11 +900,11 @@ class _FeedParserMixin: value = self.pop('height') try: value = int(value) - except ValueError: + except: value = 0 if self.inimage: context = self._getContext() - context['height'] = value + context['image']['height'] = value def _start_url(self, attrsD): self.push('href', 1) @@ -1267,6 +917,12 @@ class _FeedParserMixin: self._save_author('href', value) elif self.incontributor: self._save_contributor('href', value) + elif self.inimage: + context = self._getContext() + context['image']['href'] = value + elif self.intextinput: + context = self._getContext() + context['textinput']['link'] = value _end_homepage = _end_url _end_uri = _end_url @@ -1287,10 +943,6 @@ class _FeedParserMixin: def _getContext(self): if self.insource: context = self.sourcedata - elif self.inimage and 'image' in self.feeddata: - context = self.feeddata['image'] - elif self.intextinput: - context = self.feeddata['textinput'] elif self.inentry: context = self.entries[-1] else: @@ -1302,8 +954,6 @@ class _FeedParserMixin: context.setdefault(prefix + '_detail', FeedParserDict()) context[prefix + '_detail'][key] = value self._sync_author_detail() - context.setdefault('authors', [FeedParserDict()]) - context['authors'][-1][key] = value def _save_contributor(self, key, value): context = self._getContext() @@ -1323,29 +973,23 @@ class _FeedParserMixin: elif email: context[key] = 
email else: - author, email = context.get(key), None - if not author: - return - emailmatch = re.search(r'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))(\?subject=\S+)?''', author) - if emailmatch: - email = emailmatch.group(0) - # probably a better way to do the following, but it passes all the tests - author = author.replace(email, '') - author = author.replace('()', '') - author = author.replace('<>', '') - author = author.replace('<>', '') - author = author.strip() - if author and (author[0] == '('): - author = author[1:] - if author and (author[-1] == ')'): - author = author[:-1] - author = author.strip() - if author or email: - context.setdefault('%s_detail' % key, FeedParserDict()) - if author: - context['%s_detail' % key]['name'] = author - if email: - context['%s_detail' % key]['email'] = email + author = context.get(key) + if not author: return + emailmatch = re.search(r'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))''', author) + if not emailmatch: return + email = emailmatch.group(0) + # probably a better way to do the following, but it passes all the tests + author = author.replace(email, '') + author = author.replace('()', '') + author = author.strip() + if author and (author[0] == '('): + author = author[1:] + if author and (author[-1] == ')'): + author = author[:-1] + author = author.strip() + context.setdefault('%s_detail' % key, FeedParserDict()) + context['%s_detail' % key]['name'] = author + context['%s_detail' % key]['email'] = email def _start_subtitle(self, attrsD): self.pushContent('subtitle', attrsD, 'text/plain', 1) @@ -1356,7 +1000,7 @@ class _FeedParserMixin: self.popContent('subtitle') _end_tagline = _end_subtitle _end_itunes_subtitle = _end_subtitle - + def _start_rights(self, attrsD): self.pushContent('rights', attrsD, 'text/plain', 1) _start_dc_rights = _start_rights @@ -1372,13 
+1016,13 @@ class _FeedParserMixin: self.push('item', 0) self.inentry = 1 self.guidislink = 0 - self.title_depth = -1 id = self._getAttribute(attrsD, 'rdf:about') if id: context = self._getContext() context['id'] = id self._cdf_common(attrsD) _start_entry = _start_item + _start_product = _start_item def _end_item(self): self.pop('item') @@ -1406,30 +1050,28 @@ class _FeedParserMixin: self.push('published', 1) _start_dcterms_issued = _start_published _start_issued = _start_published - _start_pubdate = _start_published def _end_published(self): value = self.pop('published') - self._save('published_parsed', _parse_date(value), overwrite=True) + self._save('published_parsed', _parse_date(value)) _end_dcterms_issued = _end_published _end_issued = _end_published - _end_pubdate = _end_published def _start_updated(self, attrsD): self.push('updated', 1) _start_modified = _start_updated _start_dcterms_modified = _start_updated + _start_pubdate = _start_updated _start_dc_date = _start_updated - _start_lastbuilddate = _start_updated def _end_updated(self): value = self.pop('updated') parsed_value = _parse_date(value) - self._save('updated_parsed', parsed_value, overwrite=True) + self._save('updated_parsed', parsed_value) _end_modified = _end_updated _end_dcterms_modified = _end_updated + _end_pubdate = _end_updated _end_dc_date = _end_updated - _end_lastbuilddate = _end_updated def _start_created(self, attrsD): self.push('created', 1) @@ -1437,56 +1079,38 @@ class _FeedParserMixin: def _end_created(self): value = self.pop('created') - self._save('created_parsed', _parse_date(value), overwrite=True) + self._save('created_parsed', _parse_date(value)) _end_dcterms_created = _end_created def _start_expirationdate(self, attrsD): self.push('expired', 1) def _end_expirationdate(self): - self._save('expired_parsed', _parse_date(self.pop('expired')), overwrite=True) + self._save('expired_parsed', _parse_date(self.pop('expired'))) def _start_cc_license(self, attrsD): - context = 
self._getContext() + self.push('license', 1) value = self._getAttribute(attrsD, 'rdf:resource') - attrsD = FeedParserDict() - attrsD['rel'] = 'license' if value: - attrsD['href']=value - context.setdefault('links', []).append(attrsD) - + self.elementstack[-1][2].append(value) + self.pop('license') + def _start_creativecommons_license(self, attrsD): self.push('license', 1) - _start_creativeCommons_license = _start_creativecommons_license def _end_creativecommons_license(self): - value = self.pop('license') - context = self._getContext() - attrsD = FeedParserDict() - attrsD['rel'] = 'license' - if value: - attrsD['href'] = value - context.setdefault('links', []).append(attrsD) - del context['license'] - _end_creativeCommons_license = _end_creativecommons_license - - def _addXFN(self, relationships, href, name): - context = self._getContext() - xfn = context.setdefault('xfn', []) - value = FeedParserDict({'relationships': relationships, 'href': href, 'name': name}) - if value not in xfn: - xfn.append(value) + self.pop('license') def _addTag(self, term, scheme, label): context = self._getContext() tags = context.setdefault('tags', []) - if (not term) and (not scheme) and (not label): - return + if (not term) and (not scheme) and (not label): return value = FeedParserDict({'term': term, 'scheme': scheme, 'label': label}) if value not in tags: - tags.append(value) + tags.append(FeedParserDict({'term': term, 'scheme': scheme, 'label': label})) def _start_category(self, attrsD): + if _debug: sys.stderr.write('entering _start_category with %s\n' % repr(attrsD)) term = attrsD.get('term') scheme = attrsD.get('scheme', attrsD.get('domain')) label = attrsD.get('label') @@ -1494,24 +1118,18 @@ class _FeedParserMixin: self.push('category', 1) _start_dc_subject = _start_category _start_keywords = _start_category - - def _start_media_category(self, attrsD): - attrsD.setdefault('scheme', 'http://search.yahoo.com/mrss/category_schema') - self._start_category(attrsD) - + def 
_end_itunes_keywords(self): - for term in self.pop('itunes_keywords').split(','): - if term.strip(): - self._addTag(term.strip(), 'http://www.itunes.com/', None) - + for term in self.pop('itunes_keywords').split(): + self._addTag(term, 'http://www.itunes.com/', None) + def _start_itunes_category(self, attrsD): self._addTag(attrsD.get('text'), 'http://www.itunes.com/', None) self.push('category', 1) - + def _end_category(self): value = self.pop('category') - if not value: - return + if not value: return context = self._getContext() tags = context['tags'] if value and len(tags) and not tags[-1]['term']: @@ -1521,78 +1139,73 @@ class _FeedParserMixin: _end_dc_subject = _end_category _end_keywords = _end_category _end_itunes_category = _end_category - _end_media_category = _end_category def _start_cloud(self, attrsD): self._getContext()['cloud'] = FeedParserDict(attrsD) - + def _start_link(self, attrsD): attrsD.setdefault('rel', 'alternate') - if attrsD['rel'] == 'self': - attrsD.setdefault('type', 'application/atom+xml') - else: - attrsD.setdefault('type', 'text/html') - context = self._getContext() + attrsD.setdefault('type', 'text/html') attrsD = self._itsAnHrefDamnIt(attrsD) - if 'href' in attrsD: + if attrsD.has_key('href'): attrsD['href'] = self.resolveURI(attrsD['href']) expectingText = self.infeed or self.inentry or self.insource + context = self._getContext() context.setdefault('links', []) - if not (self.inentry and self.inimage): - context['links'].append(FeedParserDict(attrsD)) - if 'href' in attrsD: + context['links'].append(FeedParserDict(attrsD)) + if attrsD['rel'] == 'enclosure': + self._start_enclosure(attrsD) + if attrsD.has_key('href'): expectingText = 0 if (attrsD.get('rel') == 'alternate') and (self.mapContentType(attrsD.get('type')) in self.html_types): context['link'] = attrsD['href'] else: self.push('link', expectingText) + _start_producturl = _start_link def _end_link(self): value = self.pop('link') + context = self._getContext() + if 
self.intextinput: + context['textinput']['link'] = value + if self.inimage: + context['image']['link'] = value + _end_producturl = _end_link def _start_guid(self, attrsD): self.guidislink = (attrsD.get('ispermalink', 'true') == 'true') self.push('id', 1) - _start_id = _start_guid def _end_guid(self): value = self.pop('id') - self._save('guidislink', self.guidislink and 'link' not in self._getContext()) + self._save('guidislink', self.guidislink and not self._getContext().has_key('link')) if self.guidislink: # guid acts as link, but only if 'ispermalink' is not present or is 'true', # and only if the item doesn't already have a link element self._save('link', value) - _end_id = _end_guid def _start_title(self, attrsD): - if self.svgOK: - return self.unknown_starttag('title', list(attrsD.items())) self.pushContent('title', attrsD, 'text/plain', self.infeed or self.inentry or self.insource) _start_dc_title = _start_title _start_media_title = _start_title def _end_title(self): - if self.svgOK: - return value = self.popContent('title') - if not value: - return - self.title_depth = self.depth + context = self._getContext() + if self.intextinput: + context['textinput']['title'] = value + elif self.inimage: + context['image']['title'] = value _end_dc_title = _end_title - - def _end_media_title(self): - title_depth = self.title_depth - self._end_title() - self.title_depth = title_depth + _end_media_title = _end_title def _start_description(self, attrsD): context = self._getContext() - if 'summary' in context: + if context.has_key('summary'): self._summaryKey = 'content' self._start_content(attrsD) else: self.pushContent('description', attrsD, 'text/html', self.infeed or self.inentry or self.insource) - _start_dc_description = _start_description def _start_abstract(self, attrsD): self.pushContent('description', attrsD, 'text/plain', self.infeed or self.inentry or self.insource) @@ -1602,9 +1215,13 @@ class _FeedParserMixin: self._end_content() else: value = 
self.popContent('description') + context = self._getContext() + if self.intextinput: + context['textinput']['description'] = value + elif self.inimage: + context['image']['description'] = value self._summaryKey = None _end_abstract = _end_description - _end_dc_description = _end_description def _start_info(self, attrsD): self.pushContent('info', attrsD, 'text/plain', 1) @@ -1617,7 +1234,7 @@ class _FeedParserMixin: def _start_generator(self, attrsD): if attrsD: attrsD = self._itsAnHrefDamnIt(attrsD) - if 'href' in attrsD: + if attrsD.has_key('href'): attrsD['href'] = self.resolveURI(attrsD['href']) self._getContext()['generator_detail'] = FeedParserDict(attrsD) self.push('generator', 1) @@ -1625,9 +1242,9 @@ class _FeedParserMixin: def _end_generator(self): value = self.pop('generator') context = self._getContext() - if 'generator_detail' in context: + if context.has_key('generator_detail'): context['generator_detail']['name'] = value - + def _start_admin_generatoragent(self, attrsD): self.push('generator', 1) value = self._getAttribute(attrsD, 'rdf:resource') @@ -1642,10 +1259,10 @@ class _FeedParserMixin: if value: self.elementstack[-1][2].append(value) self.pop('errorreportsto') - + def _start_summary(self, attrsD): context = self._getContext() - if 'summary' in context: + if context.has_key('summary'): self._summaryKey = 'content' self._start_content(attrsD) else: @@ -1660,26 +1277,21 @@ class _FeedParserMixin: self.popContent(self._summaryKey or 'summary') self._summaryKey = None _end_itunes_summary = _end_summary - + def _start_enclosure(self, attrsD): attrsD = self._itsAnHrefDamnIt(attrsD) - context = self._getContext() - attrsD['rel'] = 'enclosure' - context.setdefault('links', []).append(FeedParserDict(attrsD)) - + self._getContext().setdefault('enclosures', []).append(FeedParserDict(attrsD)) + href = attrsD.get('href') + if href: + context = self._getContext() + if not context.get('id'): + context['id'] = href + def _start_source(self, attrsD): - if 'url' 
in attrsD: - # This means that we're processing a source element from an RSS 2.0 feed - self.sourcedata['href'] = attrsD['url'] - self.push('source', 1) self.insource = 1 - self.title_depth = -1 def _end_source(self): self.insource = 0 - value = self.pop('source') - if value: - self.sourcedata['title'] = value self._getContext()['source'] = copy.deepcopy(self.sourcedata) self.sourcedata.clear() @@ -1690,6 +1302,9 @@ class _FeedParserMixin: self.contentparams['src'] = src self.push('content', 1) + def _start_prodlink(self, attrsD): + self.pushContent('content', attrsD, 'text/html', 1) + def _start_body(self, attrsD): self.pushContent('content', attrsD, 'application/xhtml+xml', 1) _start_xhtml_body = _start_body @@ -1699,95 +1314,45 @@ class _FeedParserMixin: _start_fullitem = _start_content_encoded def _end_content(self): - copyToSummary = self.mapContentType(self.contentparams.get('type')) in (['text/plain'] + self.html_types) + copyToDescription = self.mapContentType(self.contentparams.get('type')) in (['text/plain'] + self.html_types) value = self.popContent('content') - if copyToSummary: - self._save('summary', value) - + if copyToDescription: + self._save('description', value) _end_body = _end_content _end_xhtml_body = _end_content _end_content_encoded = _end_content _end_fullitem = _end_content + _end_prodlink = _end_content def _start_itunes_image(self, attrsD): self.push('itunes_image', 0) - if attrsD.get('href'): - self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')}) - elif attrsD.get('url'): - self._getContext()['image'] = FeedParserDict({'href': attrsD.get('url')}) + self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')}) _start_itunes_link = _start_itunes_image - + def _end_itunes_block(self): value = self.pop('itunes_block', 0) self._getContext()['itunes_block'] = (value == 'yes') and 1 or 0 def _end_itunes_explicit(self): value = self.pop('itunes_explicit', 0) - # Convert 'yes' -> True, 'clean' to False, and any 
other value to None - # False and None both evaluate as False, so the difference can be ignored - # by applications that only need to know if the content is explicit. - self._getContext()['itunes_explicit'] = (None, False, True)[(value == 'yes' and 2) or value == 'clean' or 0] - - def _start_media_content(self, attrsD): - context = self._getContext() - context.setdefault('media_content', []) - context['media_content'].append(attrsD) - - def _start_media_thumbnail(self, attrsD): - context = self._getContext() - context.setdefault('media_thumbnail', []) - self.push('url', 1) # new - context['media_thumbnail'].append(attrsD) - - def _end_media_thumbnail(self): - url = self.pop('url') - context = self._getContext() - if url != None and len(url.strip()) != 0: - if 'url' not in context['media_thumbnail'][-1]: - context['media_thumbnail'][-1]['url'] = url - - def _start_media_player(self, attrsD): - self.push('media_player', 0) - self._getContext()['media_player'] = FeedParserDict(attrsD) - - def _end_media_player(self): - value = self.pop('media_player') - context = self._getContext() - context['media_player']['content'] = value - - def _start_newlocation(self, attrsD): - self.push('newlocation', 1) - - def _end_newlocation(self): - url = self.pop('newlocation') - context = self._getContext() - # don't set newlocation if the context isn't right - if context is not self.feeddata: - return - context['newlocation'] = _makeSafeAbsoluteURI(self.baseuri, url.strip()) + self._getContext()['itunes_explicit'] = (value == 'yes') and 1 or 0 if _XML_AVAILABLE: class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler): def __init__(self, baseuri, baselang, encoding): + if _debug: sys.stderr.write('trying StrictFeedParser\n') xml.sax.handler.ContentHandler.__init__(self) _FeedParserMixin.__init__(self, baseuri, baselang, encoding) self.bozo = 0 self.exc = None - self.decls = {} - + def startPrefixMapping(self, prefix, uri): - if not uri: - return - # Jython uses '' 
instead of None; standardize on None - prefix = prefix or None self.trackNamespace(prefix, uri) - if prefix and uri == 'http://www.w3.org/1999/xlink': - self.decls['xmlns:' + prefix] = uri - + def startElementNS(self, name, qname, attrs): namespace, localname = name lowernamespace = str(namespace or '').lower() - if lowernamespace.find('backend.userland.com/rss') != -1: + if lowernamespace.find('backend.userland.com/rss') <> -1: # match any backend.userland.com namespace namespace = 'http://backend.userland.com/rss' lowernamespace = namespace @@ -1796,9 +1361,12 @@ if _XML_AVAILABLE: else: givenprefix = None prefix = self._matchnamespaces.get(lowernamespace, givenprefix) - if givenprefix and (prefix == None or (prefix == '' and lowernamespace == '')) and givenprefix not in self.namespacesInUse: - raise UndeclaredNamespace("'%s' is not associated with a namespace" % givenprefix) + if givenprefix and (prefix == None or (prefix == '' and lowernamespace == '')) and not self.namespacesInUse.has_key(givenprefix): + raise UndeclaredNamespace, "'%s' is not associated with a namespace" % givenprefix + if prefix: + localname = prefix + ':' + localname localname = str(localname).lower() + if _debug: sys.stderr.write('startElementNS: qname = %s, namespace = %s, givenprefix = %s, prefix = %s, attrs = %s, localname = %s\n' % (qname, namespace, givenprefix, prefix, attrs.items(), localname)) # qname implementation is horribly broken in Python 2.1 (it # doesn't report any), and slightly broken in Python 2.2 (it @@ -1807,21 +1375,8 @@ if _XML_AVAILABLE: # the qnames the SAX parser gives us (if indeed it gives us any # at all). Thanks to MatejC for helping me test this and # tirelessly telling me that it didn't work yet. 
- attrsD, self.decls = self.decls, {} - if localname=='math' and namespace=='http://www.w3.org/1998/Math/MathML': - attrsD['xmlns']=namespace - if localname=='svg' and namespace=='http://www.w3.org/2000/svg': - attrsD['xmlns']=namespace - - if prefix: - localname = prefix.lower() + ':' + localname - elif namespace and not qname: #Expat - for name,value in list(self.namespacesInUse.items()): - if name and value == namespace: - localname = name + ':' + localname - break - - for (namespace, attrlocalname), attrvalue in list(attrs.items()): + attrsD = {} + for (namespace, attrlocalname), attrvalue in attrs._attrs.items(): lowernamespace = (namespace or '').lower() prefix = self._matchnamespaces.get(lowernamespace, '') if prefix: @@ -1829,7 +1384,7 @@ if _XML_AVAILABLE: attrsD[str(attrlocalname).lower()] = attrvalue for qname in attrs.getQNames(): attrsD[str(qname).lower()] = attrs.getValueByQName(qname) - self.unknown_starttag(localname, list(attrsD.items())) + self.unknown_starttag(localname, attrsD.items()) def characters(self, text): self.handle_data(text) @@ -1844,39 +1399,26 @@ if _XML_AVAILABLE: prefix = self._matchnamespaces.get(lowernamespace, givenprefix) if prefix: localname = prefix + ':' + localname - elif namespace and not qname: #Expat - for name,value in list(self.namespacesInUse.items()): - if name and value == namespace: - localname = name + ':' + localname - break localname = str(localname).lower() self.unknown_endtag(localname) def error(self, exc): self.bozo = 1 self.exc = exc - - # drv_libxml2 calls warning() in some cases - warning = error - + def fatalError(self, exc): self.error(exc) raise exc class _BaseHTMLProcessor(sgmllib.SGMLParser): - special = re.compile('''[<>'"]''') - bare_ampersand = re.compile("&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)") - elements_no_end_tag = set([ - 'area', 'base', 'basefont', 'br', 'col', 'command', 'embed', 'frame', - 'hr', 'img', 'input', 'isindex', 'keygen', 'link', 'meta', 'param', - 'source', 'track', 'wbr' - ]) - - 
def __init__(self, encoding, _type): + elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr', + 'img', 'input', 'isindex', 'link', 'meta', 'param'] + + def __init__(self, encoding): self.encoding = encoding - self._type = _type + if _debug: sys.stderr.write('entering BaseHTMLProcessor, encoding=%s\n' % self.encoding) sgmllib.SGMLParser.__init__(self) - + def reset(self): self.pieces = [] sgmllib.SGMLParser.reset(self) @@ -1887,132 +1429,80 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser): return '<' + tag + ' />' else: return '<' + tag + '>' - - # By declaring these methods and overriding their compiled code - # with the code from sgmllib, the original code will execute in - # feedparser's scope instead of sgmllib's. This means that the - # `tagfind` and `charref` regular expressions will be found as - # they're declared above, not as they're declared in sgmllib. - def goahead(self, i): - pass - goahead.__code__ = sgmllib.SGMLParser.goahead.__code__ - - def __parse_starttag(self, i): - pass - __parse_starttag.__code__ = sgmllib.SGMLParser.parse_starttag.__code__ - - def parse_starttag(self,i): - j = self.__parse_starttag(i) - if self._type == 'application/xhtml+xml': - if j>2 and self.rawdata[j-2:j]=='/>': - self.unknown_endtag(self.lasttag) - return j - + def feed(self, data): data = re.compile(r'\s]+?)\s*/>', self._shorttag_replace, data) + #data = re.sub(r'<(\S+?)\s*?/>', self._shorttag_replace, data) # bug [ 1399464 ] Bad regexp for _shorttag_replace + data = re.sub(r'<([^<\s]+?)\s*/>', self._shorttag_replace, data) data = data.replace(''', "'") data = data.replace('"', '"') - try: - bytes - if bytes is str: - raise NameError - self.encoding = self.encoding + '_INVALID_PYTHON_3' - except NameError: - if self.encoding and isinstance(data, str): - data = data.encode(self.encoding) + if self.encoding and type(data) == type(u''): + data = data.encode(self.encoding) sgmllib.SGMLParser.feed(self, data) - sgmllib.SGMLParser.close(self) def 
normalize_attrs(self, attrs): - if not attrs: - return attrs # utility method to be called by descendants - attrs = list(dict([(k.lower(), v) for k, v in attrs]).items()) + attrs = [(k.lower(), v) for k, v in attrs] attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs] - attrs.sort() return attrs def unknown_starttag(self, tag, attrs): # called for each start tag # attrs is a list of (attr, value) tuples # e.g. for
, tag='pre', attrs=[('class', 'screen')]
+        if _debug: sys.stderr.write('_BaseHTMLProcessor, unknown_starttag, tag=%s\n' % tag)
         uattrs = []
-        strattrs=''
-        if attrs:
-            for key, value in attrs:
-                value=value.replace('>','>').replace('<','<').replace('"','"')
-                value = self.bare_ampersand.sub("&", value)
-                # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds
-                if not isinstance(value, str):
-                    value = value.decode(self.encoding, 'ignore')
-                try:
-                    # Currently, in Python 3 the key is already a str, and cannot be decoded again
-                    uattrs.append((str(key, self.encoding), value))
-                except TypeError:
-                    uattrs.append((key, value))
-            strattrs = ''.join([' %s="%s"' % (key, value) for key, value in uattrs])
-            if self.encoding:
-                try:
-                    strattrs = strattrs.encode(self.encoding)
-                except (UnicodeEncodeError, LookupError):
-                    pass
+        # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds
+        for key, value in attrs:
+            if type(value) != type(u''):
+                value = unicode(value, self.encoding)
+            uattrs.append((unicode(key, self.encoding), value))
+        strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs]).encode(self.encoding)
         if tag in self.elements_no_end_tag:
-            self.pieces.append('<%s%s />' % (tag, strattrs))
+            self.pieces.append('<%(tag)s%(strattrs)s />' % locals())
         else:
-            self.pieces.append('<%s%s>' % (tag, strattrs))
+            self.pieces.append('<%(tag)s%(strattrs)s>' % locals())
 
     def unknown_endtag(self, tag):
         # called for each end tag, e.g. for 
, tag will be 'pre' # Reconstruct the original end tag. if tag not in self.elements_no_end_tag: - self.pieces.append("" % tag) + self.pieces.append("" % locals()) def handle_charref(self, ref): # called for each character reference, e.g. for ' ', ref will be '160' # Reconstruct the original character reference. - ref = ref.lower() - if ref.startswith('x'): - value = int(ref[1:], 16) - else: - value = int(ref) - - if value in _cp1252: - self.pieces.append('&#%s;' % hex(ord(_cp1252[value]))[1:]) - else: - self.pieces.append('&#%s;' % ref) - + self.pieces.append('&#%(ref)s;' % locals()) + def handle_entityref(self, ref): # called for each entity reference, e.g. for '©', ref will be 'copy' # Reconstruct the original entity reference. - if ref in name2codepoint or ref == 'apos': - self.pieces.append('&%s;' % ref) - else: - self.pieces.append('&%s' % ref) + self.pieces.append('&%(ref)s;' % locals()) def handle_data(self, text): # called for each block of plain text, i.e. outside of any tag and # not containing any character or entity references # Store the original text verbatim. + if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_text, text=%s\n' % text) self.pieces.append(text) - + def handle_comment(self, text): # called for each HTML comment, e.g. # Reconstruct the original comment. - self.pieces.append('' % text) - + self.pieces.append('' % locals()) + def handle_pi(self, text): # called for each processing instruction, e.g. # Reconstruct original processing instruction. - self.pieces.append('' % text) + self.pieces.append('' % locals()) def handle_decl(self, text): # called for the DOCTYPE, if present, e.g. 
# # Reconstruct original DOCTYPE - self.pieces.append('' % text) - + self.pieces.append('' % locals()) + _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match def _scan_name(self, i, declstartpos): rawdata = self.rawdata @@ -2031,497 +1521,36 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser): # self.updatepos(declstartpos, i) return None, -1 - def convert_charref(self, name): - return '&#%s;' % name - - def convert_entityref(self, name): - return '&%s;' % name - def output(self): '''Return processed HTML as a single string''' return ''.join([str(p) for p in self.pieces]) - def parse_declaration(self, i): - try: - return sgmllib.SGMLParser.parse_declaration(self, i) - except sgmllib.SGMLParseError: - # escape the doctype declaration and continue parsing - self.handle_data('<') - return i+1 - class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor): - def __init__(self, baseuri, baselang, encoding, entities): + def __init__(self, baseuri, baselang, encoding): sgmllib.SGMLParser.__init__(self) _FeedParserMixin.__init__(self, baseuri, baselang, encoding) - _BaseHTMLProcessor.__init__(self, encoding, 'application/xhtml+xml') - self.entities=entities def decodeEntities(self, element, data): data = data.replace('<', '<') data = data.replace('<', '<') - data = data.replace('<', '<') data = data.replace('>', '>') data = data.replace('>', '>') - data = data.replace('>', '>') data = data.replace('&', '&') data = data.replace('&', '&') data = data.replace('"', '"') data = data.replace('"', '"') data = data.replace(''', ''') data = data.replace(''', ''') - if not self.contentparams.get('type', 'xml').endswith('xml'): + if self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'): data = data.replace('<', '<') data = data.replace('>', '>') data = data.replace('&', '&') data = data.replace('"', '"') data = data.replace(''', "'") return data - - def strattrs(self, attrs): - return ''.join([' %s="%s"' % 
(n,v.replace('"','"')) for n,v in attrs]) - -class _MicroformatsParser: - STRING = 1 - DATE = 2 - URI = 3 - NODE = 4 - EMAIL = 5 - - known_xfn_relationships = set(['contact', 'acquaintance', 'friend', 'met', 'co-worker', 'coworker', 'colleague', 'co-resident', 'coresident', 'neighbor', 'child', 'parent', 'sibling', 'brother', 'sister', 'spouse', 'wife', 'husband', 'kin', 'relative', 'muse', 'crush', 'date', 'sweetheart', 'me']) - known_binary_extensions = set(['zip','rar','exe','gz','tar','tgz','tbz2','bz2','z','7z','dmg','img','sit','sitx','hqx','deb','rpm','bz2','jar','rar','iso','bin','msi','mp2','mp3','ogg','ogm','mp4','m4v','m4a','avi','wma','wmv']) - - def __init__(self, data, baseuri, encoding): - self.document = BeautifulSoup.BeautifulSoup(data) - self.baseuri = baseuri - self.encoding = encoding - if isinstance(data, str): - data = data.encode(encoding) - self.tags = [] - self.enclosures = [] - self.xfn = [] - self.vcard = None - - def vcardEscape(self, s): - if isinstance(s, str): - s = s.replace(',', '\\,').replace(';', '\\;').replace('\n', '\\n') - return s - - def vcardFold(self, s): - s = re.sub(';+$', '', s) - sFolded = '' - iMax = 75 - sPrefix = '' - while len(s) > iMax: - sFolded += sPrefix + s[:iMax] + '\n' - s = s[iMax:] - sPrefix = ' ' - iMax = 74 - sFolded += sPrefix + s - return sFolded - - def normalize(self, s): - return re.sub(r'\s+', ' ', s).strip() - - def unique(self, aList): - results = [] - for element in aList: - if element not in results: - results.append(element) - return results - - def toISO8601(self, dt): - return time.strftime('%Y-%m-%dT%H:%M:%SZ', dt) - - def getPropertyValue(self, elmRoot, sProperty, iPropertyType=4, bAllowMultiple=0, bAutoEscape=0): - all = lambda x: 1 - sProperty = sProperty.lower() - bFound = 0 - bNormalize = 1 - propertyMatch = {'class': re.compile(r'\b%s\b' % sProperty)} - if bAllowMultiple and (iPropertyType != self.NODE): - snapResults = [] - containers = elmRoot(['ul', 'ol'], propertyMatch) - for 
container in containers: - snapResults.extend(container('li')) - bFound = (len(snapResults) != 0) - if not bFound: - snapResults = elmRoot(all, propertyMatch) - bFound = (len(snapResults) != 0) - if (not bFound) and (sProperty == 'value'): - snapResults = elmRoot('pre') - bFound = (len(snapResults) != 0) - bNormalize = not bFound - if not bFound: - snapResults = [elmRoot] - bFound = (len(snapResults) != 0) - arFilter = [] - if sProperty == 'vcard': - snapFilter = elmRoot(all, propertyMatch) - for node in snapFilter: - if node.findParent(all, propertyMatch): - arFilter.append(node) - arResults = [] - for node in snapResults: - if node not in arFilter: - arResults.append(node) - bFound = (len(arResults) != 0) - if not bFound: - if bAllowMultiple: - return [] - elif iPropertyType == self.STRING: - return '' - elif iPropertyType == self.DATE: - return None - elif iPropertyType == self.URI: - return '' - elif iPropertyType == self.NODE: - return None - else: - return None - arValues = [] - for elmResult in arResults: - sValue = None - if iPropertyType == self.NODE: - if bAllowMultiple: - arValues.append(elmResult) - continue - else: - return elmResult - sNodeName = elmResult.name.lower() - if (iPropertyType == self.EMAIL) and (sNodeName == 'a'): - sValue = (elmResult.get('href') or '').split('mailto:').pop().split('?')[0] - if sValue: - sValue = bNormalize and self.normalize(sValue) or sValue.strip() - if (not sValue) and (sNodeName == 'abbr'): - sValue = elmResult.get('title') - if sValue: - sValue = bNormalize and self.normalize(sValue) or sValue.strip() - if (not sValue) and (iPropertyType == self.URI): - if sNodeName == 'a': - sValue = elmResult.get('href') - elif sNodeName == 'img': - sValue = elmResult.get('src') - elif sNodeName == 'object': - sValue = elmResult.get('data') - if sValue: - sValue = bNormalize and self.normalize(sValue) or sValue.strip() - if (not sValue) and (sNodeName == 'img'): - sValue = elmResult.get('alt') - if sValue: - sValue = bNormalize 
and self.normalize(sValue) or sValue.strip() - if not sValue: - sValue = elmResult.renderContents() - sValue = re.sub(r'<\S[^>]*>', '', sValue) - sValue = sValue.replace('\r\n', '\n') - sValue = sValue.replace('\r', '\n') - if sValue: - sValue = bNormalize and self.normalize(sValue) or sValue.strip() - if not sValue: - continue - if iPropertyType == self.DATE: - sValue = _parse_date_iso8601(sValue) - if bAllowMultiple: - arValues.append(bAutoEscape and self.vcardEscape(sValue) or sValue) - else: - return bAutoEscape and self.vcardEscape(sValue) or sValue - return arValues - - def findVCards(self, elmRoot, bAgentParsing=0): - sVCards = '' - - if not bAgentParsing: - arCards = self.getPropertyValue(elmRoot, 'vcard', bAllowMultiple=1) - else: - arCards = [elmRoot] - - for elmCard in arCards: - arLines = [] - - def processSingleString(sProperty): - sValue = self.getPropertyValue(elmCard, sProperty, self.STRING, bAutoEscape=1).decode(self.encoding) - if sValue: - arLines.append(self.vcardFold(sProperty.upper() + ':' + sValue)) - return sValue or '' - - def processSingleURI(sProperty): - sValue = self.getPropertyValue(elmCard, sProperty, self.URI) - if sValue: - sContentType = '' - sEncoding = '' - sValueKey = '' - if sValue.startswith('data:'): - sEncoding = ';ENCODING=b' - sContentType = sValue.split(';')[0].split('/').pop() - sValue = sValue.split(',', 1).pop() - else: - elmValue = self.getPropertyValue(elmCard, sProperty) - if elmValue: - if sProperty != 'url': - sValueKey = ';VALUE=uri' - sContentType = elmValue.get('type', '').strip().split('/').pop().strip() - sContentType = sContentType.upper() - if sContentType == 'OCTET-STREAM': - sContentType = '' - if sContentType: - sContentType = ';TYPE=' + sContentType.upper() - arLines.append(self.vcardFold(sProperty.upper() + sEncoding + sContentType + sValueKey + ':' + sValue)) - - def processTypeValue(sProperty, arDefaultType, arForceType=None): - arResults = self.getPropertyValue(elmCard, sProperty, bAllowMultiple=1) 
- for elmResult in arResults: - arType = self.getPropertyValue(elmResult, 'type', self.STRING, 1, 1) - if arForceType: - arType = self.unique(arForceType + arType) - if not arType: - arType = arDefaultType - sValue = self.getPropertyValue(elmResult, 'value', self.EMAIL, 0) - if sValue: - arLines.append(self.vcardFold(sProperty.upper() + ';TYPE=' + ','.join(arType) + ':' + sValue)) - - # AGENT - # must do this before all other properties because it is destructive - # (removes nested class="vcard" nodes so they don't interfere with - # this vcard's other properties) - arAgent = self.getPropertyValue(elmCard, 'agent', bAllowMultiple=1) - for elmAgent in arAgent: - if re.compile(r'\bvcard\b').search(elmAgent.get('class')): - sAgentValue = self.findVCards(elmAgent, 1) + '\n' - sAgentValue = sAgentValue.replace('\n', '\\n') - sAgentValue = sAgentValue.replace(';', '\\;') - if sAgentValue: - arLines.append(self.vcardFold('AGENT:' + sAgentValue)) - # Completely remove the agent element from the parse tree - elmAgent.extract() - else: - sAgentValue = self.getPropertyValue(elmAgent, 'value', self.URI, bAutoEscape=1); - if sAgentValue: - arLines.append(self.vcardFold('AGENT;VALUE=uri:' + sAgentValue)) - - # FN (full name) - sFN = processSingleString('fn') - - # N (name) - elmName = self.getPropertyValue(elmCard, 'n') - if elmName: - sFamilyName = self.getPropertyValue(elmName, 'family-name', self.STRING, bAutoEscape=1) - sGivenName = self.getPropertyValue(elmName, 'given-name', self.STRING, bAutoEscape=1) - arAdditionalNames = self.getPropertyValue(elmName, 'additional-name', self.STRING, 1, 1) + self.getPropertyValue(elmName, 'additional-names', self.STRING, 1, 1) - arHonorificPrefixes = self.getPropertyValue(elmName, 'honorific-prefix', self.STRING, 1, 1) + self.getPropertyValue(elmName, 'honorific-prefixes', self.STRING, 1, 1) - arHonorificSuffixes = self.getPropertyValue(elmName, 'honorific-suffix', self.STRING, 1, 1) + self.getPropertyValue(elmName, 'honorific-suffixes', 
self.STRING, 1, 1) - arLines.append(self.vcardFold('N:' + sFamilyName + ';' + - sGivenName + ';' + - ','.join(arAdditionalNames) + ';' + - ','.join(arHonorificPrefixes) + ';' + - ','.join(arHonorificSuffixes))) - elif sFN: - # implied "N" optimization - # http://microformats.org/wiki/hcard#Implied_.22N.22_Optimization - arNames = self.normalize(sFN).split() - if len(arNames) == 2: - bFamilyNameFirst = (arNames[0].endswith(',') or - len(arNames[1]) == 1 or - ((len(arNames[1]) == 2) and (arNames[1].endswith('.')))) - if bFamilyNameFirst: - arLines.append(self.vcardFold('N:' + arNames[0] + ';' + arNames[1])) - else: - arLines.append(self.vcardFold('N:' + arNames[1] + ';' + arNames[0])) - - # SORT-STRING - sSortString = self.getPropertyValue(elmCard, 'sort-string', self.STRING, bAutoEscape=1) - if sSortString: - arLines.append(self.vcardFold('SORT-STRING:' + sSortString)) - - # NICKNAME - arNickname = self.getPropertyValue(elmCard, 'nickname', self.STRING, 1, 1) - if arNickname: - arLines.append(self.vcardFold('NICKNAME:' + ','.join(arNickname))) - - # PHOTO - processSingleURI('photo') - - # BDAY - dtBday = self.getPropertyValue(elmCard, 'bday', self.DATE) - if dtBday: - arLines.append(self.vcardFold('BDAY:' + self.toISO8601(dtBday))) - - # ADR (address) - arAdr = self.getPropertyValue(elmCard, 'adr', bAllowMultiple=1) - for elmAdr in arAdr: - arType = self.getPropertyValue(elmAdr, 'type', self.STRING, 1, 1) - if not arType: - arType = ['intl','postal','parcel','work'] # default adr types, see RFC 2426 section 3.2.1 - sPostOfficeBox = self.getPropertyValue(elmAdr, 'post-office-box', self.STRING, 0, 1) - sExtendedAddress = self.getPropertyValue(elmAdr, 'extended-address', self.STRING, 0, 1) - sStreetAddress = self.getPropertyValue(elmAdr, 'street-address', self.STRING, 0, 1) - sLocality = self.getPropertyValue(elmAdr, 'locality', self.STRING, 0, 1) - sRegion = self.getPropertyValue(elmAdr, 'region', self.STRING, 0, 1) - sPostalCode = self.getPropertyValue(elmAdr, 
'postal-code', self.STRING, 0, 1) - sCountryName = self.getPropertyValue(elmAdr, 'country-name', self.STRING, 0, 1) - arLines.append(self.vcardFold('ADR;TYPE=' + ','.join(arType) + ':' + - sPostOfficeBox + ';' + - sExtendedAddress + ';' + - sStreetAddress + ';' + - sLocality + ';' + - sRegion + ';' + - sPostalCode + ';' + - sCountryName)) - - # LABEL - processTypeValue('label', ['intl','postal','parcel','work']) - - # TEL (phone number) - processTypeValue('tel', ['voice']) - - # EMAIL - processTypeValue('email', ['internet'], ['internet']) - - # MAILER - processSingleString('mailer') - - # TZ (timezone) - processSingleString('tz') - - # GEO (geographical information) - elmGeo = self.getPropertyValue(elmCard, 'geo') - if elmGeo: - sLatitude = self.getPropertyValue(elmGeo, 'latitude', self.STRING, 0, 1) - sLongitude = self.getPropertyValue(elmGeo, 'longitude', self.STRING, 0, 1) - arLines.append(self.vcardFold('GEO:' + sLatitude + ';' + sLongitude)) - - # TITLE - processSingleString('title') - - # ROLE - processSingleString('role') - - # LOGO - processSingleURI('logo') - - # ORG (organization) - elmOrg = self.getPropertyValue(elmCard, 'org') - if elmOrg: - sOrganizationName = self.getPropertyValue(elmOrg, 'organization-name', self.STRING, 0, 1) - if not sOrganizationName: - # implied "organization-name" optimization - # http://microformats.org/wiki/hcard#Implied_.22organization-name.22_Optimization - sOrganizationName = self.getPropertyValue(elmCard, 'org', self.STRING, 0, 1) - if sOrganizationName: - arLines.append(self.vcardFold('ORG:' + sOrganizationName)) - else: - arOrganizationUnit = self.getPropertyValue(elmOrg, 'organization-unit', self.STRING, 1, 1) - arLines.append(self.vcardFold('ORG:' + sOrganizationName + ';' + ';'.join(arOrganizationUnit))) - - # CATEGORY - arCategory = self.getPropertyValue(elmCard, 'category', self.STRING, 1, 1) + self.getPropertyValue(elmCard, 'categories', self.STRING, 1, 1) - if arCategory: - 
arLines.append(self.vcardFold('CATEGORIES:' + ','.join(arCategory))) - - # NOTE - processSingleString('note') - - # REV - processSingleString('rev') - - # SOUND - processSingleURI('sound') - - # UID - processSingleString('uid') - - # URL - processSingleURI('url') - - # CLASS - processSingleString('class') - - # KEY - processSingleURI('key') - - if arLines: - arLines = ['BEGIN:vCard','VERSION:3.0'] + arLines + ['END:vCard'] - # XXX - this is super ugly; properly fix this with issue 148 - for i, s in enumerate(arLines): - if not isinstance(s, str): - arLines[i] = s.decode('utf-8', 'ignore') - sVCards += '\n'.join(arLines) + '\n' - - return sVCards.strip() - - def isProbablyDownloadable(self, elm): - attrsD = elm.attrMap - if 'href' not in attrsD: - return 0 - linktype = attrsD.get('type', '').strip() - if linktype.startswith('audio/') or \ - linktype.startswith('video/') or \ - (linktype.startswith('application/') and not linktype.endswith('xml')): - return 1 - try: - path = urllib.parse.urlparse(attrsD['href'])[2] - except ValueError: - return 0 - if path.find('.') == -1: - return 0 - fileext = path.split('.').pop().lower() - return fileext in self.known_binary_extensions - - def findTags(self): - all = lambda x: 1 - for elm in self.document(all, {'rel': re.compile(r'\btag\b')}): - href = elm.get('href') - if not href: - continue - urlscheme, domain, path, params, query, fragment = \ - urllib.parse.urlparse(_urljoin(self.baseuri, href)) - segments = path.split('/') - tag = segments.pop() - if not tag: - if segments: - tag = segments.pop() - else: - # there are no tags - continue - tagscheme = urllib.parse.urlunparse((urlscheme, domain, '/'.join(segments), '', '', '')) - if not tagscheme.endswith('/'): - tagscheme += '/' - self.tags.append(FeedParserDict({"term": tag, "scheme": tagscheme, "label": elm.string or ''})) - - def findEnclosures(self): - all = lambda x: 1 - enclosure_match = re.compile(r'\benclosure\b') - for elm in self.document(all, {'href': 
re.compile(r'.+')}): - if not enclosure_match.search(elm.get('rel', '')) and not self.isProbablyDownloadable(elm): - continue - if elm.attrMap not in self.enclosures: - self.enclosures.append(elm.attrMap) - if elm.string and not elm.get('title'): - self.enclosures[-1]['title'] = elm.string - - def findXFN(self): - all = lambda x: 1 - for elm in self.document(all, {'rel': re.compile('.+'), 'href': re.compile('.+')}): - rels = elm.get('rel', '').split() - xfn_rels = [r for r in rels if r in self.known_xfn_relationships] - if xfn_rels: - self.xfn.append({"relationships": xfn_rels, "href": elm.get('href', ''), "name": elm.string}) - -def _parseMicroformats(htmlSource, baseURI, encoding): - if not BeautifulSoup: - return - try: - p = _MicroformatsParser(htmlSource, baseURI, encoding) - except UnicodeEncodeError: - # sgmllib throws this exception when performing lookups of tags - # with non-ASCII characters in them. - return - p.vcard = p.findVCards(p.document) - p.findTags() - p.findEnclosures() - p.findXFN() - return {"tags": p.tags, "enclosures": p.enclosures, "xfn": p.xfn, "vcard": p.vcard} - + class _RelativeURIResolver(_BaseHTMLProcessor): - relative_uris = set([('a', 'href'), + relative_uris = [('a', 'href'), ('applet', 'codebase'), ('area', 'href'), ('blockquote', 'cite'), @@ -2545,259 +1574,67 @@ class _RelativeURIResolver(_BaseHTMLProcessor): ('object', 'data'), ('object', 'usemap'), ('q', 'cite'), - ('script', 'src'), - ('video', 'poster')]) + ('script', 'src')] - def __init__(self, baseuri, encoding, _type): - _BaseHTMLProcessor.__init__(self, encoding, _type) + def __init__(self, baseuri, encoding): + _BaseHTMLProcessor.__init__(self, encoding) self.baseuri = baseuri def resolveURI(self, uri): - return _makeSafeAbsoluteURI(self.baseuri, uri.strip()) - + return _urljoin(self.baseuri, uri) + def unknown_starttag(self, tag, attrs): attrs = self.normalize_attrs(attrs) attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, 
value in attrs] _BaseHTMLProcessor.unknown_starttag(self, tag, attrs) - -def _resolveRelativeURIs(htmlSource, baseURI, encoding, _type): - if not _SGML_AVAILABLE: - return htmlSource - - p = _RelativeURIResolver(baseURI, encoding, _type) + +def _resolveRelativeURIs(htmlSource, baseURI, encoding): + if _debug: sys.stderr.write('entering _resolveRelativeURIs\n') + p = _RelativeURIResolver(baseURI, encoding) p.feed(htmlSource) return p.output() -def _makeSafeAbsoluteURI(base, rel=None): - # bail if ACCEPTABLE_URI_SCHEMES is empty - if not ACCEPTABLE_URI_SCHEMES: - try: - return _urljoin(base, rel or '') - except ValueError: - return '' - if not base: - return rel or '' - if not rel: - try: - scheme = urllib.parse.urlparse(base)[0] - except ValueError: - return '' - if not scheme or scheme in ACCEPTABLE_URI_SCHEMES: - return base - return '' - try: - uri = _urljoin(base, rel) - except ValueError: - return '' - if uri.strip().split(':', 1)[0] not in ACCEPTABLE_URI_SCHEMES: - return '' - return uri - class _HTMLSanitizer(_BaseHTMLProcessor): - acceptable_elements = set(['a', 'abbr', 'acronym', 'address', 'area', - 'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button', - 'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup', - 'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn', - 'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset', - 'figcaption', 'figure', 'footer', 'font', 'form', 'header', 'h1', - 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins', - 'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter', - 'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option', - 'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select', - 'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong', - 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot', - 'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video', 'noscript']) + acceptable_elements = ['a', 'abbr', 
'acronym', 'address', 'area', 'b', 'big', + 'blockquote', 'br', 'button', 'caption', 'center', 'cite', 'code', 'col', + 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'fieldset', + 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input', + 'ins', 'kbd', 'label', 'legend', 'li', 'map', 'menu', 'ol', 'optgroup', + 'option', 'p', 'pre', 'q', 's', 'samp', 'select', 'small', 'span', 'strike', + 'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'tfoot', 'th', + 'thead', 'tr', 'tt', 'u', 'ul', 'var'] - acceptable_attributes = set(['abbr', 'accept', 'accept-charset', 'accesskey', - 'action', 'align', 'alt', 'autocomplete', 'autofocus', 'axis', - 'background', 'balance', 'bgcolor', 'bgproperties', 'border', - 'bordercolor', 'bordercolordark', 'bordercolorlight', 'bottompadding', - 'cellpadding', 'cellspacing', 'ch', 'challenge', 'char', 'charoff', - 'choff', 'charset', 'checked', 'cite', 'class', 'clear', 'color', 'cols', - 'colspan', 'compact', 'contenteditable', 'controls', 'coords', 'data', - 'datafld', 'datapagesize', 'datasrc', 'datetime', 'default', 'delay', - 'dir', 'disabled', 'draggable', 'dynsrc', 'enctype', 'end', 'face', 'for', - 'form', 'frame', 'galleryimg', 'gutter', 'headers', 'height', 'hidefocus', - 'hidden', 'high', 'href', 'hreflang', 'hspace', 'icon', 'id', 'inputmode', - 'ismap', 'keytype', 'label', 'leftspacing', 'lang', 'list', 'longdesc', - 'loop', 'loopcount', 'loopend', 'loopstart', 'low', 'lowsrc', 'max', - 'maxlength', 'media', 'method', 'min', 'multiple', 'name', 'nohref', - 'noshade', 'nowrap', 'open', 'optimum', 'pattern', 'ping', 'point-size', - 'poster', 'pqg', 'preload', 'prompt', 'radiogroup', 'readonly', 'rel', - 'repeat-max', 'repeat-min', 'replace', 'required', 'rev', 'rightspacing', - 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size', 'span', - 'src', 'start', 'step', 'summary', 'suppress', 'tabindex', 'target', - 'template', 'title', 'toppadding', 'type', 'unselectable', 
'usemap', - 'urn', 'valign', 'value', 'variable', 'volume', 'vspace', 'vrml', - 'width', 'wrap', 'xml:lang']) + acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey', + 'action', 'align', 'alt', 'axis', 'border', 'cellpadding', 'cellspacing', + 'char', 'charoff', 'charset', 'checked', 'cite', 'class', 'clear', 'cols', + 'colspan', 'color', 'compact', 'coords', 'datetime', 'dir', 'disabled', + 'enctype', 'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace', + 'id', 'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method', + 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', + 'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size', + 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title', 'type', + 'usemap', 'valign', 'value', 'vspace', 'width'] - unacceptable_elements_with_end_tag = set(['script', 'applet', 'style']) - - acceptable_css_properties = set(['azimuth', 'background-color', - 'border-bottom-color', 'border-collapse', 'border-color', - 'border-left-color', 'border-right-color', 'border-top-color', 'clear', - 'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font', - 'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight', - 'height', 'letter-spacing', 'line-height', 'overflow', 'pause', - 'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness', - 'speak', 'speak-header', 'speak-numeral', 'speak-punctuation', - 'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent', - 'unicode-bidi', 'vertical-align', 'voice-family', 'volume', - 'white-space', 'width']) - - # survey of common keywords found in feeds - acceptable_css_keywords = set(['auto', 'aqua', 'black', 'block', 'blue', - 'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed', - 'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left', - 'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive', - 'pointer', 
'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top', - 'transparent', 'underline', 'white', 'yellow']) - - valid_css_values = re.compile('^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|' + - '\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$') - - mathml_elements = set(['annotation', 'annotation-xml', 'maction', 'math', - 'merror', 'mfenced', 'mfrac', 'mi', 'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', - 'mphantom', 'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', - 'msub', 'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder', - 'munderover', 'none', 'semantics']) - - mathml_attributes = set(['actiontype', 'align', 'columnalign', 'columnalign', - 'columnalign', 'close', 'columnlines', 'columnspacing', 'columnspan', 'depth', - 'display', 'displaystyle', 'encoding', 'equalcolumns', 'equalrows', - 'fence', 'fontstyle', 'fontweight', 'frame', 'height', 'linethickness', - 'lspace', 'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant', - 'maxsize', 'minsize', 'open', 'other', 'rowalign', 'rowalign', 'rowalign', - 'rowlines', 'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection', - 'separator', 'separators', 'stretchy', 'width', 'width', 'xlink:href', - 'xlink:show', 'xlink:type', 'xmlns', 'xmlns:xlink']) - - # svgtiny - foreignObject + linearGradient + radialGradient + stop - svg_elements = set(['a', 'animate', 'animateColor', 'animateMotion', - 'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'foreignObject', - 'font-face', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern', - 'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph', 'mpath', - 'path', 'polygon', 'polyline', 'radialGradient', 'rect', 'set', 'stop', - 'svg', 'switch', 'text', 'title', 'tspan', 'use']) - - # svgtiny + class + opacity + offset + xmlns + xmlns:xlink - svg_attributes = set(['accent-height', 'accumulate', 'additive', 'alphabetic', - 'arabic-form', 'ascent', 'attributeName', 'attributeType', - 'baseProfile', 'bbox', 'begin', 'by', 
'calcMode', 'cap-height', - 'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx', - 'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-opacity', - 'fill-rule', 'font-family', 'font-size', 'font-stretch', 'font-style', - 'font-variant', 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2', - 'glyph-name', 'gradientUnits', 'hanging', 'height', 'horiz-adv-x', - 'horiz-origin-x', 'id', 'ideographic', 'k', 'keyPoints', 'keySplines', - 'keyTimes', 'lang', 'mathematical', 'marker-end', 'marker-mid', - 'marker-start', 'markerHeight', 'markerUnits', 'markerWidth', 'max', - 'min', 'name', 'offset', 'opacity', 'orient', 'origin', - 'overline-position', 'overline-thickness', 'panose-1', 'path', - 'pathLength', 'points', 'preserveAspectRatio', 'r', 'refX', 'refY', - 'repeatCount', 'repeatDur', 'requiredExtensions', 'requiredFeatures', - 'restart', 'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv', - 'stop-color', 'stop-opacity', 'strikethrough-position', - 'strikethrough-thickness', 'stroke', 'stroke-dasharray', - 'stroke-dashoffset', 'stroke-linecap', 'stroke-linejoin', - 'stroke-miterlimit', 'stroke-opacity', 'stroke-width', 'systemLanguage', - 'target', 'text-anchor', 'to', 'transform', 'type', 'u1', 'u2', - 'underline-position', 'underline-thickness', 'unicode', 'unicode-range', - 'units-per-em', 'values', 'version', 'viewBox', 'visibility', 'width', - 'widths', 'x', 'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole', - 'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title', 'xlink:type', - 'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1', - 'y2', 'zoomAndPan']) - - svg_attr_map = None - svg_elem_map = None - - acceptable_svg_properties = set([ 'fill', 'fill-opacity', 'fill-rule', - 'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin', - 'stroke-opacity']) + unacceptable_elements_with_end_tag = ['script', 'applet'] def reset(self): _BaseHTMLProcessor.reset(self) self.unacceptablestack = 0 - self.mathmlOK = 0 - self.svgOK = 
0 - + def unknown_starttag(self, tag, attrs): - acceptable_attributes = self.acceptable_attributes - keymap = {} - if not tag in self.acceptable_elements or self.svgOK: + if not tag in self.acceptable_elements: if tag in self.unacceptable_elements_with_end_tag: self.unacceptablestack += 1 - - # add implicit namespaces to html5 inline svg/mathml - if self._type.endswith('html'): - if not dict(attrs).get('xmlns'): - if tag=='svg': - attrs.append( ('xmlns','http://www.w3.org/2000/svg') ) - if tag=='math': - attrs.append( ('xmlns','http://www.w3.org/1998/Math/MathML') ) - - # not otherwise acceptable, perhaps it is MathML or SVG? - if tag=='math' and ('xmlns','http://www.w3.org/1998/Math/MathML') in attrs: - self.mathmlOK += 1 - if tag=='svg' and ('xmlns','http://www.w3.org/2000/svg') in attrs: - self.svgOK += 1 - - # chose acceptable attributes based on tag class, else bail - if self.mathmlOK and tag in self.mathml_elements: - acceptable_attributes = self.mathml_attributes - elif self.svgOK and tag in self.svg_elements: - # for most vocabularies, lowercasing is a good idea. 
Many - # svg elements, however, are camel case - if not self.svg_attr_map: - lower=[attr.lower() for attr in self.svg_attributes] - mix=[a for a in self.svg_attributes if a not in lower] - self.svg_attributes = lower - self.svg_attr_map = dict([(a.lower(),a) for a in mix]) - - lower=[attr.lower() for attr in self.svg_elements] - mix=[a for a in self.svg_elements if a not in lower] - self.svg_elements = lower - self.svg_elem_map = dict([(a.lower(),a) for a in mix]) - acceptable_attributes = self.svg_attributes - tag = self.svg_elem_map.get(tag,tag) - keymap = self.svg_attr_map - elif not tag in self.acceptable_elements: - return - - # declare xlink namespace, if needed - if self.mathmlOK or self.svgOK: - if [n_v for n_v in attrs if n_v[0].startswith('xlink:')]: - if not ('xmlns:xlink','http://www.w3.org/1999/xlink') in attrs: - attrs.append(('xmlns:xlink','http://www.w3.org/1999/xlink')) - - clean_attrs = [] - for key, value in self.normalize_attrs(attrs): - if key in acceptable_attributes: - key=keymap.get(key,key) - # make sure the uri uses an acceptable uri scheme - if key == 'href': - value = _makeSafeAbsoluteURI(value) - clean_attrs.append((key,value)) - elif key=='style': - clean_value = self.sanitize_style(value) - if clean_value: - clean_attrs.append((key,clean_value)) - _BaseHTMLProcessor.unknown_starttag(self, tag, clean_attrs) - + return + attrs = self.normalize_attrs(attrs) + attrs = [(key, value) for key, value in attrs if key in self.acceptable_attributes] + _BaseHTMLProcessor.unknown_starttag(self, tag, attrs) + def unknown_endtag(self, tag): if not tag in self.acceptable_elements: if tag in self.unacceptable_elements_with_end_tag: self.unacceptablestack -= 1 - if self.mathmlOK and tag in self.mathml_elements: - if tag == 'math' and self.mathmlOK: - self.mathmlOK -= 1 - elif self.svgOK and tag in self.svg_elements: - tag = self.svg_elem_map.get(tag,tag) - if tag == 'svg' and self.svgOK: - self.svgOK -= 1 - else: - return + return 
_BaseHTMLProcessor.unknown_endtag(self, tag) def handle_pi(self, text): @@ -2810,53 +1647,8 @@ class _HTMLSanitizer(_BaseHTMLProcessor): if not self.unacceptablestack: _BaseHTMLProcessor.handle_data(self, text) - def sanitize_style(self, style): - # disallow urls - style=re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ',style) - - # gauntlet - if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): - return '' - # This replaced a regexp that used re.match and was prone to pathological back-tracking. - if re.sub("\s*[-\w]+\s*:\s*[^:;]*;?", '', style).strip(): - return '' - - clean = [] - for prop,value in re.findall("([-\w]+)\s*:\s*([^:;]*)",style): - if not value: - continue - if prop.lower() in self.acceptable_css_properties: - clean.append(prop + ': ' + value + ';') - elif prop.split('-')[0].lower() in ['background','border','margin','padding']: - for keyword in value.split(): - if not keyword in self.acceptable_css_keywords and \ - not self.valid_css_values.match(keyword): - break - else: - clean.append(prop + ': ' + value + ';') - elif self.svgOK and prop.lower() in self.acceptable_svg_properties: - clean.append(prop + ': ' + value + ';') - - return ' '.join(clean) - - def parse_comment(self, i, report=1): - ret = _BaseHTMLProcessor.parse_comment(self, i, report) - if ret >= 0: - return ret - # if ret == -1, this may be a malicious attempt to circumvent - # sanitization, or a page-destroying unclosed comment - match = re.compile(r'--[^>]*>').search(self.rawdata, i+4) - if match: - return match.end() - # unclosed comment; deliberately fail to handle_data() - return len(self.rawdata) - - -def _sanitizeHTML(htmlSource, encoding, _type): - if not _SGML_AVAILABLE: - return htmlSource - p = _HTMLSanitizer(encoding, _type) - htmlSource = htmlSource.replace(''): @@ -2894,50 +1686,61 @@ def _sanitizeHTML(htmlSource, encoding, _type): data = data.strip().replace('\r\n', '\n') return data -class 
_FeedURLHandler(urllib.request.HTTPDigestAuthHandler, urllib.request.HTTPRedirectHandler, urllib.request.HTTPDefaultErrorHandler): +class _FeedURLHandler(urllib2.HTTPDigestAuthHandler, urllib2.HTTPRedirectHandler, urllib2.HTTPDefaultErrorHandler): def http_error_default(self, req, fp, code, msg, headers): - # The default implementation just raises HTTPError. - # Forget that. - fp.status = code - return fp + if ((code / 100) == 3) and (code != 304): + return self.http_error_302(req, fp, code, msg, headers) + infourl = urllib.addinfourl(fp, headers, req.get_full_url()) + infourl.status = code + return infourl - def http_error_301(self, req, fp, code, msg, hdrs): - result = urllib.request.HTTPRedirectHandler.http_error_301(self, req, fp, - code, msg, hdrs) - result.status = code - result.newurl = result.geturl() - return result - # The default implementations in urllib2.HTTPRedirectHandler - # are identical, so hardcoding a http_error_301 call above - # won't affect anything - http_error_300 = http_error_301 - http_error_302 = http_error_301 - http_error_303 = http_error_301 - http_error_307 = http_error_301 + def http_error_302(self, req, fp, code, msg, headers): + if headers.dict.has_key('location'): + infourl = urllib2.HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers) + else: + infourl = urllib.addinfourl(fp, headers, req.get_full_url()) + if not hasattr(infourl, 'status'): + infourl.status = code + return infourl + def http_error_301(self, req, fp, code, msg, headers): + if headers.dict.has_key('location'): + infourl = urllib2.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, headers) + else: + infourl = urllib.addinfourl(fp, headers, req.get_full_url()) + if not hasattr(infourl, 'status'): + infourl.status = code + return infourl + + http_error_300 = http_error_302 + http_error_303 = http_error_302 + http_error_307 = http_error_302 + def http_error_401(self, req, fp, code, msg, headers): # Check if # - server requires digest auth, 
AND # - we tried (unsuccessfully) with basic auth, AND + # - we're using Python 2.3.3 or later (digest auth is irreparably broken in earlier versions) # If all conditions hold, parse authentication information # out of the Authorization header we sent the first time # (for the username and password) and the WWW-Authenticate # header the server sent back (for the realm) and retry # the request with the appropriate digest auth headers instead. # This evil genius hack has been brought to you by Aaron Swartz. - host = urllib.parse.urlparse(req.get_full_url())[1] - if base64 is None or 'Authorization' not in req.headers \ - or 'WWW-Authenticate' not in headers: + host = urlparse.urlparse(req.get_full_url())[1] + try: + assert sys.version.split()[0] >= '2.3.3' + assert base64 != None + user, passw = base64.decodestring(req.headers['Authorization'].split(' ')[1]).split(':') + realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0] + self.add_password(realm, host, user, passw) + retry = self.http_error_auth_reqed('www-authenticate', host, req, headers) + self.reset_retry_count() + return retry + except: return self.http_error_default(req, fp, code, msg, headers) - auth = _base64decode(req.headers['Authorization'].split(' ')[1]) - user, passw = auth.split(':') - realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0] - self.add_password(realm, host, user, passw) - retry = self.http_error_auth_reqed('www-authenticate', host, req, headers) - self.reset_retry_count() - return retry -def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers): +def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers): """URL, filename, or string --> stream This function lets you define parsers that take any input source @@ -2949,12 +1752,10 @@ def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, h If the etag argument is supplied, it will be used as the value of an 
If-None-Match request header. - If the modified argument is supplied, it can be a tuple of 9 integers - (as returned by gmtime() in the standard Python time module) or a date - string in any format supported by feedparser. Regardless, it MUST - be in GMT (Greenwich Mean Time). It will be reformatted into an - RFC 1123-compliant date and used as the value of an If-Modified-Since - request header. + If the modified argument is supplied, it must be a tuple of 9 integers + as returned by gmtime() in the standard Python time module. This MUST + be in GMT (Greenwich Mean Time). The formatted date/time will be used + as the value of an If-Modified-Since request header. If the agent argument is supplied, it will be used as the value of a User-Agent request header. @@ -2964,132 +1765,76 @@ def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, h If handlers is supplied, it is a list of handlers used to build a urllib2 opener. - - if request_headers is supplied it is a dictionary of HTTP request headers - that will override the values generated by FeedParser. 
""" if hasattr(url_file_stream_or_string, 'read'): return url_file_stream_or_string - if isinstance(url_file_stream_or_string, str) \ - and urllib.parse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp', 'file', 'feed'): - # Deal with the feed URI scheme - if url_file_stream_or_string.startswith('feed:http'): - url_file_stream_or_string = url_file_stream_or_string[5:] - elif url_file_stream_or_string.startswith('feed:'): - url_file_stream_or_string = 'http:' + url_file_stream_or_string[5:] + if url_file_stream_or_string == '-': + return sys.stdin + + if urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp'): if not agent: agent = USER_AGENT - # Test for inline user:password credentials for HTTP basic auth + # test for inline user:password for basic auth auth = None - if base64 and not url_file_stream_or_string.startswith('ftp:'): - urltype, rest = urllib.parse.splittype(url_file_stream_or_string) - realhost, rest = urllib.parse.splithost(rest) + if base64: + urltype, rest = urllib.splittype(url_file_stream_or_string) + realhost, rest = urllib.splithost(rest) if realhost: - user_passwd, realhost = urllib.parse.splituser(realhost) + user_passwd, realhost = urllib.splituser(realhost) if user_passwd: url_file_stream_or_string = '%s://%s%s' % (urltype, realhost, rest) - auth = base64.standard_b64encode(user_passwd).strip() - - # iri support - if isinstance(url_file_stream_or_string, str): - url_file_stream_or_string = _convert_to_idn(url_file_stream_or_string) - + auth = base64.encodestring(user_passwd).strip() # try to open with urllib2 (to use optional headers) - request = _build_urllib2_request(url_file_stream_or_string, agent, etag, modified, referrer, auth, request_headers) - opener = urllib.request.build_opener(*tuple(handlers + [_FeedURLHandler()])) + request = urllib2.Request(url_file_stream_or_string) + request.add_header('User-Agent', agent) + if etag: + request.add_header('If-None-Match', etag) + if modified: + # format 
into an RFC 1123-compliant timestamp. We can't use + # time.strftime() since the %a and %b directives can be affected + # by the current locale, but RFC 2616 states that dates must be + # in English. + short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'] + months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] + request.add_header('If-Modified-Since', '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5])) + if referrer: + request.add_header('Referer', referrer) + if gzip and zlib: + request.add_header('Accept-encoding', 'gzip, deflate') + elif gzip: + request.add_header('Accept-encoding', 'gzip') + elif zlib: + request.add_header('Accept-encoding', 'deflate') + else: + request.add_header('Accept-encoding', '') + if auth: + request.add_header('Authorization', 'Basic %s' % auth) + if ACCEPT_HEADER: + request.add_header('Accept', ACCEPT_HEADER) + request.add_header('A-IM', 'feed') # RFC 3229 support + opener = apply(urllib2.build_opener, tuple([_FeedURLHandler()] + handlers)) opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent try: return opener.open(request) finally: opener.close() # JohnD - + # try to open with native open function (if url_file_stream_or_string is a filename) try: - return open(url_file_stream_or_string, 'rb') - except (IOError, UnicodeEncodeError, TypeError): - # if url_file_stream_or_string is a unicode object that - # cannot be converted to the encoding returned by - # sys.getfilesystemencoding(), a UnicodeEncodeError - # will be thrown - # If url_file_stream_or_string is a string that contains NULL - # (such as an XML document encoded in UTF-32), TypeError will - # be thrown. 
+ return open(url_file_stream_or_string) + except: pass # treat url_file_stream_or_string as string - if isinstance(url_file_stream_or_string, str): - return _StringIO(url_file_stream_or_string.encode('utf-8')) - return _StringIO(url_file_stream_or_string) - -def _convert_to_idn(url): - """Convert a URL to IDN notation""" - # this function should only be called with a unicode string - # strategy: if the host cannot be encoded in ascii, then - # it'll be necessary to encode it in idn form - parts = list(urllib.parse.urlsplit(url)) - try: - parts[1].encode('ascii') - except UnicodeEncodeError: - # the url needs to be converted to idn notation - host = parts[1].rsplit(':', 1) - newhost = [] - port = '' - if len(host) == 2: - port = host.pop() - for h in host[0].split('.'): - newhost.append(h.encode('idna').decode('utf-8')) - parts[1] = '.'.join(newhost) - if port: - parts[1] += ':' + port - return urllib.parse.urlunsplit(parts) - else: - return url - -def _build_urllib2_request(url, agent, etag, modified, referrer, auth, request_headers): - request = urllib.request.Request(url) - request.add_header('User-Agent', agent) - if etag: - request.add_header('If-None-Match', etag) - if isinstance(modified, str): - modified = _parse_date(modified) - elif isinstance(modified, datetime.datetime): - modified = modified.utctimetuple() - if modified: - # format into an RFC 1123-compliant timestamp. We can't use - # time.strftime() since the %a and %b directives can be affected - # by the current locale, but RFC 2616 states that dates must be - # in English. 
- short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'] - months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] - request.add_header('If-Modified-Since', '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5])) - if referrer: - request.add_header('Referer', referrer) - if gzip and zlib: - request.add_header('Accept-encoding', 'gzip, deflate') - elif gzip: - request.add_header('Accept-encoding', 'gzip') - elif zlib: - request.add_header('Accept-encoding', 'deflate') - else: - request.add_header('Accept-encoding', '') - if auth: - request.add_header('Authorization', 'Basic %s' % auth) - if ACCEPT_HEADER: - request.add_header('Accept', ACCEPT_HEADER) - # use this for whatever -- cookies, special headers, etc - # [('Cookie','Something'),('x-special-header','Another Value')] - for header_name, header_value in list(request_headers.items()): - request.add_header(header_name, header_value) - request.add_header('A-IM', 'feed') # RFC 3229 support - return request + return _StringIO(str(url_file_stream_or_string)) _date_handlers = [] def registerDateHandler(func): '''Register a date handler function (takes string, returns 9-tuple date in GMT)''' _date_handlers.insert(0, func) - + # ISO-8601 date parsing routines written by Fazal Majid. # The ISO 8601 standard is very convoluted and irregular - a full ISO 8601 # parser is beyond the scope of feedparser and would be a worthwhile addition @@ -3099,8 +1844,8 @@ def registerDateHandler(func): # 0301-04-01), so we use templates instead. # Please note the order in templates is significant because we need a # greedy match. 
-_iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-0MM?-?DD', 'YYYY-MM', 'YYYY-?OOO', - 'YY-?MM-?DD', 'YY-?OOO', 'YYYY', +_iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-MM', 'YYYY-?OOO', + 'YY-?MM-?DD', 'YY-?OOO', 'YYYY', '-YY-?MM', '-OOO', '-YY', '--MM-?DD', '--MM', '---DD', @@ -3115,29 +1860,19 @@ _iso8601_re = [ 'CC', r'(?P\d\d$)') + r'(T?(?P\d{2}):(?P\d{2})' + r'(:(?P\d{2}))?' - + r'(\.(?P\d+))?' + r'(?P[+-](?P\d{2})(:(?P\d{2}))?|Z)?)?' for tmpl in _iso8601_tmpl] -try: - del tmpl -except NameError: - pass +del tmpl _iso8601_matches = [re.compile(regex).match for regex in _iso8601_re] -try: - del regex -except NameError: - pass +del regex def _parse_date_iso8601(dateString): '''Parse a variety of ISO-8601-compatible formats like 20040105''' m = None for _iso8601_match in _iso8601_matches: m = _iso8601_match(dateString) - if m: - break - if not m: - return - if m.span() == (0, 0): - return + if m: break + if not m: return + if m.span() == (0, 0): return params = m.groupdict() ordinal = params.get('ordinal', 0) if ordinal: @@ -3175,7 +1910,7 @@ def _parse_date_iso8601(dateString): day = int(day) # special case of the century - is the first year of the 21st century # 2000 or 2001 ? The debate goes on... - if 'century' in params: + if 'century' in params.keys(): year = (int(params['century']) - 1) * 100 + 1 # in ISO 8601 most fields are optional for field in ['hour', 'minute', 'second', 'tzhour', 'tzmin']: @@ -3183,10 +1918,14 @@ def _parse_date_iso8601(dateString): params[field] = 0 hour = int(params.get('hour', 0)) minute = int(params.get('minute', 0)) - second = int(float(params.get('second', 0))) + second = int(params.get('second', 0)) # weekday is normalized by mktime(), we can ignore it weekday = 0 - daylight_savings_flag = -1 + # daylight savings is complex, but not needed for feedparser's purposes + # as time zones, if specified, include mention of whether it is active + # (e.g. PST vs. PDT, CET). 
Using -1 is implementation-dependent and + # and most implementations have DST bugs + daylight_savings_flag = 0 tm = [year, month, day, hour, minute, second, weekday, ordinal, daylight_savings_flag] # ISO 8601 time zone adjustments @@ -3203,39 +1942,38 @@ def _parse_date_iso8601(dateString): # Python's time.mktime() is a wrapper around the ANSI C mktime(3c) # which is guaranteed to normalize d/m/y/h/m/s. # Many implementations have bugs, but we'll pretend they don't. - return time.localtime(time.mktime(tuple(tm))) + return time.localtime(time.mktime(tm)) registerDateHandler(_parse_date_iso8601) - + # 8-bit date handling routines written by ytrewq1. -_korean_year = '\ub144' # b3e2 in euc-kr -_korean_month = '\uc6d4' # bff9 in euc-kr -_korean_day = '\uc77c' # c0cf in euc-kr -_korean_am = '\uc624\uc804' # bfc0 c0fc in euc-kr -_korean_pm = '\uc624\ud6c4' # bfc0 c8c4 in euc-kr +_korean_year = u'\ub144' # b3e2 in euc-kr +_korean_month = u'\uc6d4' # bff9 in euc-kr +_korean_day = u'\uc77c' # c0cf in euc-kr +_korean_am = u'\uc624\uc804' # bfc0 c0fc in euc-kr +_korean_pm = u'\uc624\ud6c4' # bfc0 c8c4 in euc-kr _korean_onblog_date_re = \ re.compile('(\d{4})%s\s+(\d{2})%s\s+(\d{2})%s\s+(\d{2}):(\d{2}):(\d{2})' % \ (_korean_year, _korean_month, _korean_day)) _korean_nate_date_re = \ - re.compile('(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})' % \ + re.compile(u'(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})' % \ (_korean_am, _korean_pm)) def _parse_date_onblog(dateString): '''Parse a string according to the OnBlog 8-bit date format''' m = _korean_onblog_date_re.match(dateString) - if not m: - return + if not m: return w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \ {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\ 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\ 'zonediff': '+09:00'} + if _debug: sys.stderr.write('OnBlog date parsed as: %s\n' % w3dtfdate) return 
_parse_date_w3dtf(w3dtfdate) registerDateHandler(_parse_date_onblog) def _parse_date_nate(dateString): '''Parse a string according to the Nate 8-bit date format''' m = _korean_nate_date_re.match(dateString) - if not m: - return + if not m: return hour = int(m.group(5)) ampm = m.group(4) if (ampm == _korean_pm): @@ -3247,97 +1985,118 @@ def _parse_date_nate(dateString): {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\ 'hour': hour, 'minute': m.group(6), 'second': m.group(7),\ 'zonediff': '+09:00'} + if _debug: sys.stderr.write('Nate date parsed as: %s\n' % w3dtfdate) return _parse_date_w3dtf(w3dtfdate) registerDateHandler(_parse_date_nate) +_mssql_date_re = \ + re.compile('(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})(\.\d+)?') +def _parse_date_mssql(dateString): + '''Parse a string according to the MS SQL date format''' + m = _mssql_date_re.match(dateString) + if not m: return + w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \ + {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\ + 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\ + 'zonediff': '+09:00'} + if _debug: sys.stderr.write('MS SQL date parsed as: %s\n' % w3dtfdate) + return _parse_date_w3dtf(w3dtfdate) +registerDateHandler(_parse_date_mssql) + # Unicode strings for Greek date strings _greek_months = \ { \ - '\u0399\u03b1\u03bd': 'Jan', # c9e1ed in iso-8859-7 - '\u03a6\u03b5\u03b2': 'Feb', # d6e5e2 in iso-8859-7 - '\u039c\u03ac\u03ce': 'Mar', # ccdcfe in iso-8859-7 - '\u039c\u03b1\u03ce': 'Mar', # cce1fe in iso-8859-7 - '\u0391\u03c0\u03c1': 'Apr', # c1f0f1 in iso-8859-7 - '\u039c\u03ac\u03b9': 'May', # ccdce9 in iso-8859-7 - '\u039c\u03b1\u03ca': 'May', # cce1fa in iso-8859-7 - '\u039c\u03b1\u03b9': 'May', # cce1e9 in iso-8859-7 - '\u0399\u03bf\u03cd\u03bd': 'Jun', # c9effded in iso-8859-7 - '\u0399\u03bf\u03bd': 'Jun', # c9efed in iso-8859-7 - '\u0399\u03bf\u03cd\u03bb': 'Jul', # c9effdeb in iso-8859-7 - 
'\u0399\u03bf\u03bb': 'Jul', # c9f9eb in iso-8859-7 - '\u0391\u03cd\u03b3': 'Aug', # c1fde3 in iso-8859-7 - '\u0391\u03c5\u03b3': 'Aug', # c1f5e3 in iso-8859-7 - '\u03a3\u03b5\u03c0': 'Sep', # d3e5f0 in iso-8859-7 - '\u039f\u03ba\u03c4': 'Oct', # cfeaf4 in iso-8859-7 - '\u039d\u03bf\u03ad': 'Nov', # cdefdd in iso-8859-7 - '\u039d\u03bf\u03b5': 'Nov', # cdefe5 in iso-8859-7 - '\u0394\u03b5\u03ba': 'Dec', # c4e5ea in iso-8859-7 + u'\u0399\u03b1\u03bd': u'Jan', # c9e1ed in iso-8859-7 + u'\u03a6\u03b5\u03b2': u'Feb', # d6e5e2 in iso-8859-7 + u'\u039c\u03ac\u03ce': u'Mar', # ccdcfe in iso-8859-7 + u'\u039c\u03b1\u03ce': u'Mar', # cce1fe in iso-8859-7 + u'\u0391\u03c0\u03c1': u'Apr', # c1f0f1 in iso-8859-7 + u'\u039c\u03ac\u03b9': u'May', # ccdce9 in iso-8859-7 + u'\u039c\u03b1\u03ca': u'May', # cce1fa in iso-8859-7 + u'\u039c\u03b1\u03b9': u'May', # cce1e9 in iso-8859-7 + u'\u0399\u03bf\u03cd\u03bd': u'Jun', # c9effded in iso-8859-7 + u'\u0399\u03bf\u03bd': u'Jun', # c9efed in iso-8859-7 + u'\u0399\u03bf\u03cd\u03bb': u'Jul', # c9effdeb in iso-8859-7 + u'\u0399\u03bf\u03bb': u'Jul', # c9f9eb in iso-8859-7 + u'\u0391\u03cd\u03b3': u'Aug', # c1fde3 in iso-8859-7 + u'\u0391\u03c5\u03b3': u'Aug', # c1f5e3 in iso-8859-7 + u'\u03a3\u03b5\u03c0': u'Sep', # d3e5f0 in iso-8859-7 + u'\u039f\u03ba\u03c4': u'Oct', # cfeaf4 in iso-8859-7 + u'\u039d\u03bf\u03ad': u'Nov', # cdefdd in iso-8859-7 + u'\u039d\u03bf\u03b5': u'Nov', # cdefe5 in iso-8859-7 + u'\u0394\u03b5\u03ba': u'Dec', # c4e5ea in iso-8859-7 } _greek_wdays = \ { \ - '\u039a\u03c5\u03c1': 'Sun', # caf5f1 in iso-8859-7 - '\u0394\u03b5\u03c5': 'Mon', # c4e5f5 in iso-8859-7 - '\u03a4\u03c1\u03b9': 'Tue', # d4f1e9 in iso-8859-7 - '\u03a4\u03b5\u03c4': 'Wed', # d4e5f4 in iso-8859-7 - '\u03a0\u03b5\u03bc': 'Thu', # d0e5ec in iso-8859-7 - '\u03a0\u03b1\u03c1': 'Fri', # d0e1f1 in iso-8859-7 - '\u03a3\u03b1\u03b2': 'Sat', # d3e1e2 in iso-8859-7 + u'\u039a\u03c5\u03c1': u'Sun', # caf5f1 in iso-8859-7 + u'\u0394\u03b5\u03c5': u'Mon', 
# c4e5f5 in iso-8859-7 + u'\u03a4\u03c1\u03b9': u'Tue', # d4f1e9 in iso-8859-7 + u'\u03a4\u03b5\u03c4': u'Wed', # d4e5f4 in iso-8859-7 + u'\u03a0\u03b5\u03bc': u'Thu', # d0e5ec in iso-8859-7 + u'\u03a0\u03b1\u03c1': u'Fri', # d0e1f1 in iso-8859-7 + u'\u03a3\u03b1\u03b2': u'Sat', # d3e1e2 in iso-8859-7 } _greek_date_format_re = \ - re.compile('([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)') + re.compile(u'([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)') def _parse_date_greek(dateString): '''Parse a string according to a Greek 8-bit date format.''' m = _greek_date_format_re.match(dateString) - if not m: + if not m: return + try: + wday = _greek_wdays[m.group(1)] + month = _greek_months[m.group(3)] + except: return - wday = _greek_wdays[m.group(1)] - month = _greek_months[m.group(3)] rfc822date = '%(wday)s, %(day)s %(month)s %(year)s %(hour)s:%(minute)s:%(second)s %(zonediff)s' % \ {'wday': wday, 'day': m.group(2), 'month': month, 'year': m.group(4),\ 'hour': m.group(5), 'minute': m.group(6), 'second': m.group(7),\ 'zonediff': m.group(8)} + if _debug: sys.stderr.write('Greek date parsed as: %s\n' % rfc822date) return _parse_date_rfc822(rfc822date) registerDateHandler(_parse_date_greek) # Unicode strings for Hungarian date strings _hungarian_months = \ { \ - 'janu\u00e1r': '01', # e1 in iso-8859-2 - 'febru\u00e1ri': '02', # e1 in iso-8859-2 - 'm\u00e1rcius': '03', # e1 in iso-8859-2 - '\u00e1prilis': '04', # e1 in iso-8859-2 - 'm\u00e1ujus': '05', # e1 in iso-8859-2 - 'j\u00fanius': '06', # fa in iso-8859-2 - 'j\u00falius': '07', # fa in iso-8859-2 - 'augusztus': '08', - 'szeptember': '09', - 'okt\u00f3ber': '10', # f3 in iso-8859-2 - 'november': '11', - 'december': '12', + u'janu\u00e1r': u'01', # e1 in iso-8859-2 + u'febru\u00e1ri': u'02', # e1 in iso-8859-2 + u'm\u00e1rcius': u'03', # e1 in iso-8859-2 + u'\u00e1prilis': u'04', # e1 in iso-8859-2 + u'm\u00e1ujus': u'05', # e1 in iso-8859-2 + u'j\u00fanius': 
u'06', # fa in iso-8859-2 + u'j\u00falius': u'07', # fa in iso-8859-2 + u'augusztus': u'08', + u'szeptember': u'09', + u'okt\u00f3ber': u'10', # f3 in iso-8859-2 + u'november': u'11', + u'december': u'12', } _hungarian_date_format_re = \ - re.compile('(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})((\+|-)(\d{,2}:\d{2}))') + re.compile(u'(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})((\+|-)(\d{,2}:\d{2}))') def _parse_date_hungarian(dateString): '''Parse a string according to a Hungarian 8-bit date format.''' m = _hungarian_date_format_re.match(dateString) - if not m or m.group(2) not in _hungarian_months: - return None - month = _hungarian_months[m.group(2)] - day = m.group(3) - if len(day) == 1: - day = '0' + day - hour = m.group(4) - if len(hour) == 1: - hour = '0' + hour + if not m: return + try: + month = _hungarian_months[m.group(2)] + day = m.group(3) + if len(day) == 1: + day = '0' + day + hour = m.group(4) + if len(hour) == 1: + hour = '0' + hour + except: + return w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s%(zonediff)s' % \ {'year': m.group(1), 'month': month, 'day': day,\ 'hour': hour, 'minute': m.group(5),\ 'zonediff': m.group(6)} + if _debug: sys.stderr.write('Hungarian date parsed as: %s\n' % w3dtfdate) return _parse_date_w3dtf(w3dtfdate) registerDateHandler(_parse_date_hungarian) @@ -3345,9 +2104,6 @@ registerDateHandler(_parse_date_hungarian) # Drake and licensed under the Python license. 
Removed all range checking # for month, day, hour, minute, and second, since mktime will normalize # these later -# Modified to also support MSSQL-style datetimes as defined at: -# http://msdn.microsoft.com/en-us/library/ms186724.aspx -# (which basically means allowing a space as a date/time/timezone separator) def _parse_date_w3dtf(dateString): def __extract_date(m): year = int(m.group('year')) @@ -3373,7 +2129,7 @@ def _parse_date_w3dtf(dateString): day = 31 elif jday < julian: if day + diff < 28: - day = day + diff + day = day + diff else: month = month + 1 return year, month, day @@ -3427,558 +2183,414 @@ def _parse_date_w3dtf(dateString): __date_re = ('(?P\d\d\d\d)' '(?:(?P-|)' - '(?:(?P\d\d)(?:(?P=dsep)(?P\d\d))?' - '|(?P\d\d\d)))?') - __tzd_re = ' ?(?P[-+](?P\d\d)(?::?(?P\d\d))|Z)?' + '(?:(?P\d\d\d)' + '|(?P\d\d)(?:(?P=dsep)(?P\d\d))?))?') + __tzd_re = '(?P[-+](?P\d\d)(?::?(?P\d\d))|Z)' + __tzd_rx = re.compile(__tzd_re) __time_re = ('(?P\d\d)(?P:|)(?P\d\d)' - '(?:(?P=tsep)(?P\d\d)(?:[.,]\d+)?)?' + '(?:(?P=tsep)(?P\d\d(?:[.,]\d+)?))?' + __tzd_re) - __datetime_re = '%s(?:[T ]%s)?' % (__date_re, __time_re) + __datetime_re = '%s(?:T%s)?' 
% (__date_re, __time_re) __datetime_rx = re.compile(__datetime_re) m = __datetime_rx.match(dateString) - if (m is None) or (m.group() != dateString): - return + if (m is None) or (m.group() != dateString): return gmt = __extract_date(m) + __extract_time(m) + (0, 0, 0) - if gmt[0] == 0: - return + if gmt[0] == 0: return return time.gmtime(time.mktime(gmt) + __extract_tzd(m) - time.timezone) registerDateHandler(_parse_date_w3dtf) -# Define the strings used by the RFC822 datetime parser -_rfc822_months = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', - 'jul', 'aug', 'sep', 'oct', 'nov', 'dec'] -_rfc822_daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'] - -# Only the first three letters of the month name matter -_rfc822_month = "(?P%s)(?:[a-z]*,?)" % ('|'.join(_rfc822_months)) -# The year may be 2 or 4 digits; capture the century if it exists -_rfc822_year = "(?P(?:\d{2})?\d{2})" -_rfc822_day = "(?P *\d{1,2})" -_rfc822_date = "%s %s %s" % (_rfc822_day, _rfc822_month, _rfc822_year) - -_rfc822_hour = "(?P\d{2}):(?P\d{2})(?::(?P\d{2}))?" -_rfc822_tz = "(?Put|gmt(?:[+-]\d{2}:\d{2})?|[aecmp][sd]?t|[zamny]|[+-]\d{4})" -_rfc822_tznames = { - 'ut': 0, 'gmt': 0, 'z': 0, - 'adt': -3, 'ast': -4, 'at': -4, - 'edt': -4, 'est': -5, 'et': -5, - 'cdt': -5, 'cst': -6, 'ct': -6, - 'mdt': -6, 'mst': -7, 'mt': -7, - 'pdt': -7, 'pst': -8, 'pt': -8, - 'a': -1, 'n': 1, - 'm': -12, 'y': 12, - } -# The timezone may be prefixed by 'Etc/' -_rfc822_time = "%s (?:etc/)?%s" % (_rfc822_hour, _rfc822_tz) - -_rfc822_dayname = "(?P%s)" % ('|'.join(_rfc822_daynames)) -_rfc822_match = re.compile( - "(?:%s, )?%s(?: %s)?" 
% (_rfc822_dayname, _rfc822_date, _rfc822_time) -).match - -def _parse_date_group_rfc822(m): - # Calculate a date and timestamp - for k in ('year', 'day', 'hour', 'minute', 'second'): - m[k] = int(m[k]) - m['month'] = _rfc822_months.index(m['month']) + 1 - # If the year is 2 digits, assume everything in the 90's is the 1990's - if m['year'] < 100: - m['year'] += (1900, 2000)[m['year'] < 90] - stamp = datetime.datetime(*[m[i] for i in - ('year', 'month', 'day', 'hour', 'minute', 'second')]) - - # Use the timezone information to calculate the difference between - # the given date and timestamp and Universal Coordinated Time - tzhour = 0 - tzmin = 0 - if m['tz'] and m['tz'].startswith('gmt'): - # Handle GMT and GMT+hh:mm timezone syntax (the trailing - # timezone info will be handled by the next `if` block) - m['tz'] = ''.join(m['tz'][3:].split(':')) or 'gmt' - if not m['tz']: - pass - elif m['tz'].startswith('+'): - tzhour = int(m['tz'][1:3]) - tzmin = int(m['tz'][3:]) - elif m['tz'].startswith('-'): - tzhour = int(m['tz'][1:3]) * -1 - tzmin = int(m['tz'][3:]) * -1 - else: - tzhour = _rfc822_tznames[m['tz']] - delta = datetime.timedelta(0, 0, 0, 0, tzmin, tzhour) - - # Return the date and timestamp in UTC - return (stamp - delta).utctimetuple() - -def _parse_date_rfc822(dt): - """Parse RFC 822 dates and times, with one minor - difference: years may be 4DIGIT or 2DIGIT. - http://tools.ietf.org/html/rfc822#section-5""" - try: - m = _rfc822_match(dt.lower()).groupdict(0) - except AttributeError: - return None - - return _parse_date_group_rfc822(m) -registerDateHandler(_parse_date_rfc822) - -def _parse_date_rfc822_grubby(dt): - """Parse date format similar to RFC 822, but - the comma after the dayname is optional and - month/day are inverted""" - _rfc822_date_grubby = "%s %s %s" % (_rfc822_month, _rfc822_day, _rfc822_year) - _rfc822_match_grubby = re.compile( - "(?:%s[,]? )?%s(?: %s)?" 
% (_rfc822_dayname, _rfc822_date_grubby, _rfc822_time) - ).match - - try: - m = _rfc822_match_grubby(dt.lower()).groupdict(0) - except AttributeError: - return None - - return _parse_date_group_rfc822(m) -registerDateHandler(_parse_date_rfc822_grubby) - -def _parse_date_asctime(dt): - """Parse asctime-style dates""" - dayname, month, day, remainder = dt.split(None, 3) - # Convert month and day into zero-padded integers - month = '%02i ' % (_rfc822_months.index(month.lower()) + 1) - day = '%02i ' % (int(day),) - dt = month + day + remainder - return time.strptime(dt, '%m %d %H:%M:%S %Y')[:-1] + (0, ) -registerDateHandler(_parse_date_asctime) - -def _parse_date_perforce(aDateString): - """parse a date in yyyy/mm/dd hh:mm:ss TTT format""" - # Fri, 2006/09/15 08:19:53 EDT - _my_date_pattern = re.compile( \ - r'(\w{,3}), (\d{,4})/(\d{,2})/(\d{2}) (\d{,2}):(\d{2}):(\d{2}) (\w{,3})') - - m = _my_date_pattern.search(aDateString) - if m is None: - return None - dow, year, month, day, hour, minute, second, tz = m.groups() - months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] - dateString = "%s, %s %s %s %s:%s:%s %s" % (dow, day, months[int(month) - 1], year, hour, minute, second, tz) +def _parse_date_rfc822(dateString): + '''Parse an RFC822, RFC1123, RFC2822, or asctime-style date''' + data = dateString.split() + if data[0][-1] in (',', '.') or data[0].lower() in rfc822._daynames: + del data[0] + if len(data) == 4: + s = data[3] + i = s.find('+') + if i > 0: + data[3:] = [s[:i], s[i+1:]] + else: + data.append('') + dateString = " ".join(data) + if len(data) < 5: + dateString += ' 00:00:00 GMT' tm = rfc822.parsedate_tz(dateString) if tm: return time.gmtime(rfc822.mktime_tz(tm)) -registerDateHandler(_parse_date_perforce) +# rfc822.py defines several time zones, but we define some extra ones. +# 'ET' is equivalent to 'EST', etc. 
+_additional_timezones = {'AT': -400, 'ET': -500, 'CT': -600, 'MT': -700, 'PT': -800} +rfc822._timezones.update(_additional_timezones) +registerDateHandler(_parse_date_rfc822) def _parse_date(dateString): '''Parses a variety of date formats into a 9-tuple in GMT''' - if not dateString: - return None for handler in _date_handlers: try: date9tuple = handler(dateString) - except (KeyError, OverflowError, ValueError): - continue - if not date9tuple: - continue - if len(date9tuple) != 9: - continue - return date9tuple + if not date9tuple: continue + if len(date9tuple) != 9: + if _debug: sys.stderr.write('date handler function must return 9-tuple\n') + raise ValueError + map(int, date9tuple) + return date9tuple + except Exception, e: + if _debug: sys.stderr.write('%s raised %s\n' % (handler.__name__, repr(e))) + pass return None -# Each marker represents some of the characters of the opening XML -# processing instruction (' -RE_XML_DECLARATION = re.compile('^<\?xml[^>]*?>') - -# Capture the value of the XML processing instruction's encoding attribute. -# Example: -RE_XML_PI_ENCODING = re.compile(_s2bytes('^<\?.*encoding=[\'"](.*?)[\'"].*\?>')) - -def convert_to_utf8(http_headers, data): - '''Detect and convert the character encoding to UTF-8. +def _getCharacterEncoding(http_headers, xml_data): + '''Get the character encoding of the XML document http_headers is a dictionary - data is a raw string (not Unicode)''' + xml_data is a raw string (not Unicode) + + This is so much trickier than it sounds, it's not even funny. + According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type + is application/xml, application/*+xml, + application/xml-external-parsed-entity, or application/xml-dtd, + the encoding given in the charset parameter of the HTTP Content-Type + takes precedence over the encoding given in the XML prefix within the + document, and defaults to 'utf-8' if neither are specified. 
But, if + the HTTP Content-Type is text/xml, text/*+xml, or + text/xml-external-parsed-entity, the encoding given in the XML prefix + within the document is ALWAYS IGNORED and only the encoding given in + the charset parameter of the HTTP Content-Type header should be + respected, and it defaults to 'us-ascii' if not specified. - # This is so much trickier than it sounds, it's not even funny. - # According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type - # is application/xml, application/*+xml, - # application/xml-external-parsed-entity, or application/xml-dtd, - # the encoding given in the charset parameter of the HTTP Content-Type - # takes precedence over the encoding given in the XML prefix within the - # document, and defaults to 'utf-8' if neither are specified. But, if - # the HTTP Content-Type is text/xml, text/*+xml, or - # text/xml-external-parsed-entity, the encoding given in the XML prefix - # within the document is ALWAYS IGNORED and only the encoding given in - # the charset parameter of the HTTP Content-Type header should be - # respected, and it defaults to 'us-ascii' if not specified. + Furthermore, discussion on the atom-syntax mailing list with the + author of RFC 3023 leads me to the conclusion that any document + served with a Content-Type of text/* and no charset parameter + must be treated as us-ascii. (We now do this.) And also that it + must always be flagged as non-well-formed. (We now do this too.) + + If Content-Type is unspecified (input was local file or non-HTTP source) + or unrecognized (server just got it totally wrong), then go by the + encoding given in the XML prefix of the document and default to + 'iso-8859-1' as per the HTTP specification (RFC 2616). + + Then, assuming we didn't find a character encoding in the HTTP headers + (and the HTTP Content-type allowed us to look in the body), we need + to sniff the first few bytes of the XML data and try to determine + whether the encoding is ASCII-compatible. 
Section F of the XML + specification shows the way here: + http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info - # Furthermore, discussion on the atom-syntax mailing list with the - # author of RFC 3023 leads me to the conclusion that any document - # served with a Content-Type of text/* and no charset parameter - # must be treated as us-ascii. (We now do this.) And also that it - # must always be flagged as non-well-formed. (We now do this too.) + If the sniffed encoding is not ASCII-compatible, we need to make it + ASCII compatible so that we can sniff further into the XML declaration + to find the encoding attribute, which will tell us the true encoding. - # If Content-Type is unspecified (input was local file or non-HTTP source) - # or unrecognized (server just got it totally wrong), then go by the - # encoding given in the XML prefix of the document and default to - # 'iso-8859-1' as per the HTTP specification (RFC 2616). + Of course, none of this guarantees that we will be able to parse the + feed in the declared character encoding (assuming it was declared + correctly, which many are not). CJKCodecs and iconv_codec help a lot; + you should definitely install them if you can. + http://cjkpython.i18n.org/ + ''' - # Then, assuming we didn't find a character encoding in the HTTP headers - # (and the HTTP Content-type allowed us to look in the body), we need - # to sniff the first few bytes of the XML data and try to determine - # whether the encoding is ASCII-compatible. Section F of the XML - # specification shows the way here: - # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info + def _parseHTTPContentType(content_type): + '''takes HTTP Content-Type header and returns (content type, charset) - # If the sniffed encoding is not ASCII-compatible, we need to make it - # ASCII compatible so that we can sniff further into the XML declaration - # to find the encoding attribute, which will tell us the true encoding. 
+ If no charset is specified, returns (content type, '') + If no content type is specified, returns ('', '') + Both return parameters are guaranteed to be lowercase strings + ''' + content_type = content_type or '' + content_type, params = cgi.parse_header(content_type) + return content_type, params.get('charset', '').replace("'", '') - # Of course, none of this guarantees that we will be able to parse the - # feed in the declared character encoding (assuming it was declared - # correctly, which many are not). iconv_codec can help a lot; - # you should definitely install it if you can. - # http://cjkpython.i18n.org/ - - bom_encoding = '' + sniffed_xml_encoding = '' xml_encoding = '' - rfc3023_encoding = '' - - # Look at the first few bytes of the document to guess what - # its encoding may be. We only need to decode enough of the - # document that we can use an ASCII-compatible regular - # expression to search for an XML encoding declaration. - # The heuristic follows the XML specification, section F: + true_encoding = '' + http_content_type, http_encoding = _parseHTTPContentType(http_headers.get('content-type')) + # Must sniff for non-ASCII-compatible character encodings before + # searching for XML declaration. This heuristic is defined in + # section F of the XML specification: # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info - # Check for BOMs first. 
- if data[:4] == codecs.BOM_UTF32_BE: - bom_encoding = 'utf-32be' - data = data[4:] - elif data[:4] == codecs.BOM_UTF32_LE: - bom_encoding = 'utf-32le' - data = data[4:] - elif data[:2] == codecs.BOM_UTF16_BE and data[2:4] != ZERO_BYTES: - bom_encoding = 'utf-16be' - data = data[2:] - elif data[:2] == codecs.BOM_UTF16_LE and data[2:4] != ZERO_BYTES: - bom_encoding = 'utf-16le' - data = data[2:] - elif data[:3] == codecs.BOM_UTF8: - bom_encoding = 'utf-8' - data = data[3:] - # Check for the characters '= 4) and (xml_data[:2] == '\xfe\xff') and (xml_data[2:4] != '\x00\x00'): + # UTF-16BE with BOM + sniffed_xml_encoding = 'utf-16be' + xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8') + elif xml_data[:4] == '\x3c\x00\x3f\x00': + # UTF-16LE + sniffed_xml_encoding = 'utf-16le' + xml_data = unicode(xml_data, 'utf-16le').encode('utf-8') + elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and (xml_data[2:4] != '\x00\x00'): + # UTF-16LE with BOM + sniffed_xml_encoding = 'utf-16le' + xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8') + elif xml_data[:4] == '\x00\x00\x00\x3c': + # UTF-32BE + sniffed_xml_encoding = 'utf-32be' + xml_data = unicode(xml_data, 'utf-32be').encode('utf-8') + elif xml_data[:4] == '\x3c\x00\x00\x00': + # UTF-32LE + sniffed_xml_encoding = 'utf-32le' + xml_data = unicode(xml_data, 'utf-32le').encode('utf-8') + elif xml_data[:4] == '\x00\x00\xfe\xff': + # UTF-32BE with BOM + sniffed_xml_encoding = 'utf-32be' + xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8') + elif xml_data[:4] == '\xff\xfe\x00\x00': + # UTF-32LE with BOM + sniffed_xml_encoding = 'utf-32le' + xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8') + elif xml_data[:3] == '\xef\xbb\xbf': + # UTF-8 with BOM + sniffed_xml_encoding = 'utf-8' + xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8') + else: + # ASCII-compatible + pass + xml_encoding_match = re.compile('^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data) + except: 
xml_encoding_match = None - else: - xml_encoding_match = RE_XML_PI_ENCODING.match(tempdata) - if xml_encoding_match: - xml_encoding = xml_encoding_match.groups()[0].decode('utf-8').lower() - # Normalize the xml_encoding if necessary. - if bom_encoding and (xml_encoding in ( - 'u16', 'utf-16', 'utf16', 'utf_16', - 'u32', 'utf-32', 'utf32', 'utf_32', - 'iso-10646-ucs-2', 'iso-10646-ucs-4', - 'csucs4', 'csunicode', 'ucs-2', 'ucs-4' - )): - xml_encoding = bom_encoding - - # Find the HTTP Content-Type and, hopefully, a character - # encoding provided by the server. The Content-Type is used - # to choose the "correct" encoding among the BOM encoding, - # XML declaration encoding, and HTTP encoding, following the - # heuristic defined in RFC 3023. - http_content_type = http_headers.get('content-type') or '' - http_content_type, params = cgi.parse_header(http_content_type) - http_encoding = params.get('charset', '').replace("'", "") - if not isinstance(http_encoding, str): - http_encoding = http_encoding.decode('utf-8', 'ignore') - + xml_encoding = xml_encoding_match.groups()[0].lower() + if sniffed_xml_encoding and (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16')): + xml_encoding = sniffed_xml_encoding acceptable_content_type = 0 - application_content_types = ('application/xml', 'application/xml-dtd', - 'application/xml-external-parsed-entity') + application_content_types = ('application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity') text_content_types = ('text/xml', 'text/xml-external-parsed-entity') if (http_content_type in application_content_types) or \ - (http_content_type.startswith('application/') and - http_content_type.endswith('+xml')): + (http_content_type.startswith('application/') and http_content_type.endswith('+xml')): acceptable_content_type = 1 - rfc3023_encoding = http_encoding or xml_encoding or 'utf-8' + true_encoding = 
http_encoding or xml_encoding or 'utf-8' elif (http_content_type in text_content_types) or \ - (http_content_type.startswith('text/') and - http_content_type.endswith('+xml')): + (http_content_type.startswith('text/')) and http_content_type.endswith('+xml'): acceptable_content_type = 1 - rfc3023_encoding = http_encoding or 'us-ascii' + true_encoding = http_encoding or 'us-ascii' elif http_content_type.startswith('text/'): - rfc3023_encoding = http_encoding or 'us-ascii' - elif http_headers and 'content-type' not in http_headers: - rfc3023_encoding = xml_encoding or 'iso-8859-1' + true_encoding = http_encoding or 'us-ascii' + elif http_headers and (not http_headers.has_key('content-type')): + true_encoding = xml_encoding or 'iso-8859-1' else: - rfc3023_encoding = xml_encoding or 'utf-8' - # gb18030 is a superset of gb2312, so always replace gb2312 - # with gb18030 for greater compatibility. - if rfc3023_encoding.lower() == 'gb2312': - rfc3023_encoding = 'gb18030' - if xml_encoding.lower() == 'gb2312': - xml_encoding = 'gb18030' + true_encoding = xml_encoding or 'utf-8' + return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type + +def _toUTF8(data, encoding): + '''Changes an XML data stream on the fly to specify a new encoding + + data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already + encoding is a string recognized by encodings.aliases + ''' + if _debug: sys.stderr.write('entering _toUTF8, trying encoding %s\n' % encoding) + # strip Byte Order Mark (if present) + if (len(data) >= 4) and (data[:2] == '\xfe\xff') and (data[2:4] != '\x00\x00'): + if _debug: + sys.stderr.write('stripping BOM\n') + if encoding != 'utf-16be': + sys.stderr.write('trying utf-16be instead\n') + encoding = 'utf-16be' + data = data[2:] + elif (len(data) >= 4) and (data[:2] == '\xff\xfe') and (data[2:4] != '\x00\x00'): + if _debug: + sys.stderr.write('stripping BOM\n') + if encoding != 'utf-16le': + 
sys.stderr.write('trying utf-16le instead\n') + encoding = 'utf-16le' + data = data[2:] + elif data[:3] == '\xef\xbb\xbf': + if _debug: + sys.stderr.write('stripping BOM\n') + if encoding != 'utf-8': + sys.stderr.write('trying utf-8 instead\n') + encoding = 'utf-8' + data = data[3:] + elif data[:4] == '\x00\x00\xfe\xff': + if _debug: + sys.stderr.write('stripping BOM\n') + if encoding != 'utf-32be': + sys.stderr.write('trying utf-32be instead\n') + encoding = 'utf-32be' + data = data[4:] + elif data[:4] == '\xff\xfe\x00\x00': + if _debug: + sys.stderr.write('stripping BOM\n') + if encoding != 'utf-32le': + sys.stderr.write('trying utf-32le instead\n') + encoding = 'utf-32le' + data = data[4:] + newdata = unicode(data, encoding) + if _debug: sys.stderr.write('successfully converted %s data to unicode\n' % encoding) + declmatch = re.compile('^<\?xml[^>]*?>') + newdecl = '''''' + if declmatch.search(newdata): + newdata = declmatch.sub(newdecl, newdata) + else: + newdata = newdecl + u'\n' + newdata + return newdata.encode('utf-8') + +def _stripDoctype(data): + '''Strips DOCTYPE from XML document, returns (rss_version, stripped_data) + + rss_version may be 'rss091n' or None + stripped_data is the same XML document, minus the DOCTYPE + ''' + entity_pattern = re.compile(r']*?)>', re.MULTILINE) + data = entity_pattern.sub('', data) + doctype_pattern = re.compile(r']*?)>', re.MULTILINE) + doctype_results = doctype_pattern.findall(data) + doctype = doctype_results and doctype_results[0] or '' + if doctype.lower().count('netscape'): + version = 'rss091n' + else: + version = None + data = doctype_pattern.sub('', data) + return version, data + +def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=[]): + '''Parse a feed from a URL, file, stream, or string''' + result = FeedParserDict() + result['feed'] = FeedParserDict() + result['entries'] = [] + if _XML_AVAILABLE: + result['bozo'] = 0 + if type(handlers) == types.InstanceType: + 
handlers = [handlers] + try: + f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers) + data = f.read() + except Exception, e: + result['bozo'] = 1 + result['bozo_exception'] = e + data = '' + f = None + + # if feed is gzip-compressed, decompress it + if f and data and hasattr(f, 'headers'): + if gzip and f.headers.get('content-encoding', '') == 'gzip': + try: + data = gzip.GzipFile(fileobj=_StringIO(data)).read() + except Exception, e: + # Some feeds claim to be gzipped but they're not, so + # we get garbage. Ideally, we should re-request the + # feed without the 'Accept-encoding: gzip' header, + # but we don't. + result['bozo'] = 1 + result['bozo_exception'] = e + data = '' + elif zlib and f.headers.get('content-encoding', '') == 'deflate': + try: + data = zlib.decompress(data, -zlib.MAX_WBITS) + except Exception, e: + result['bozo'] = 1 + result['bozo_exception'] = e + data = '' + + # save HTTP headers + if hasattr(f, 'info'): + info = f.info() + result['etag'] = info.getheader('ETag') + last_modified = info.getheader('Last-Modified') + if last_modified: + result['modified'] = _parse_date(last_modified) + if hasattr(f, 'url'): + result['href'] = f.url + result['status'] = 200 + if hasattr(f, 'status'): + result['status'] = f.status + if hasattr(f, 'headers'): + result['headers'] = f.headers.dict + if hasattr(f, 'close'): + f.close() # there are four encodings to keep track of: # - http_encoding is the encoding declared in the Content-Type HTTP header # - xml_encoding is the encoding declared in the ''' - if RE_XML_DECLARATION.search(data): - data = RE_XML_DECLARATION.sub(new_declaration, data) - else: - data = new_declaration + '\n' + data - data = data.encode('utf-8') - break - # if still no luck, give up - if not known_encoding: - error = CharacterEncodingUnknown( - 'document encoding unknown, I tried ' + - '%s, %s, utf-8, windows-1252, and iso-8859-2 but nothing worked' % - (rfc3023_encoding, xml_encoding)) - rfc3023_encoding 
= '' - elif proposed_encoding != rfc3023_encoding: - error = CharacterEncodingOverride( - 'document declared as %s, but parsed as %s' % - (rfc3023_encoding, proposed_encoding)) - rfc3023_encoding = proposed_encoding - - return data, rfc3023_encoding, error - -# Match XML entity declarations. -# Example: -RE_ENTITY_PATTERN = re.compile(_s2bytes(r'^\s*]*?)>'), re.MULTILINE) - -# Match XML DOCTYPE declarations. -# Example: -RE_DOCTYPE_PATTERN = re.compile(_s2bytes(r'^\s*]*?)>'), re.MULTILINE) - -# Match safe entity declarations. -# This will allow hexadecimal character references through, -# as well as text, but not arbitrary nested entities. -# Example: cubed "³" -# Example: copyright "(C)" -# Forbidden: explode1 "&explode2;&explode2;" -RE_SAFE_ENTITY_PATTERN = re.compile(_s2bytes('\s+(\w+)\s+"(&#\w+;|[^&"]*)"')) - -def replace_doctype(data): - '''Strips and replaces the DOCTYPE, returns (rss_version, stripped_data) - - rss_version may be 'rss091n' or None - stripped_data is the same XML document with a replaced DOCTYPE - ''' - - # Divide the document into two groups by finding the location - # of the first element that doesn't begin with '\n\n]>') - data = RE_DOCTYPE_PATTERN.sub(replacement, head) + data - - # Precompute the safe entities for the loose parser. - safe_entities = dict((k.decode('utf-8'), v.decode('utf-8')) - for k, v in RE_SAFE_ENTITY_PATTERN.findall(replacement)) - return version, data, safe_entities - -def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=None, request_headers=None, response_headers=None): - '''Parse a feed from a URL, file, stream, or string. - - request_headers, if given, is a dict from http header name to value to add - to the request; this overrides internally generated values. 
- ''' - - if handlers is None: - handlers = [] - if request_headers is None: - request_headers = {} - if response_headers is None: - response_headers = {} - - result = FeedParserDict() - result['feed'] = FeedParserDict() - result['entries'] = [] - result['bozo'] = 0 - if not isinstance(handlers, list): - handlers = [handlers] - try: - f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers) - data = f.read() - except Exception as e: + bozo_message = 'no Content-type specified' result['bozo'] = 1 - result['bozo_exception'] = e - data = None - f = None + result['bozo_exception'] = NonXMLContentType(bozo_message) + + result['version'], data = _stripDoctype(data) - if hasattr(f, 'headers'): - result['headers'] = dict(f.headers) - # overwrite existing headers using response_headers - if 'headers' in result: - result['headers'].update(response_headers) - elif response_headers: - result['headers'] = copy.deepcopy(response_headers) + baseuri = http_headers.get('content-location', result.get('href')) + baselang = http_headers.get('content-language', None) - # lowercase all of the HTTP headers for comparisons per RFC 2616 - if 'headers' in result: - http_headers = dict((k.lower(), v) for k, v in list(result['headers'].items())) - else: - http_headers = {} - - # if feed is gzip-compressed, decompress it - if f and data and http_headers: - if gzip and 'gzip' in http_headers.get('content-encoding', ''): - try: - data = gzip.GzipFile(fileobj=_StringIO(data)).read() - except (IOError, struct.error) as e: - # IOError can occur if the gzip header is bad. - # struct.error can occur if the data is damaged. - result['bozo'] = 1 - result['bozo_exception'] = e - if isinstance(e, struct.error): - # A gzip header was found but the data is corrupt. - # Ideally, we should re-request the feed without the - # 'Accept-encoding: gzip' header, but we don't. 
- data = None - elif zlib and 'deflate' in http_headers.get('content-encoding', ''): - try: - data = zlib.decompress(data) - except zlib.error as e: - try: - # The data may have no headers and no checksum. - data = zlib.decompress(data, -15) - except zlib.error as e: - result['bozo'] = 1 - result['bozo_exception'] = e - - # save HTTP headers - if http_headers: - if 'etag' in http_headers: - etag = http_headers.get('etag', '') - if not isinstance(etag, str): - etag = etag.decode('utf-8', 'ignore') - if etag: - result['etag'] = etag - if 'last-modified' in http_headers: - modified = http_headers.get('last-modified', '') - if modified: - result['modified'] = modified - result['modified_parsed'] = _parse_date(modified) - if hasattr(f, 'url'): - if not isinstance(f.url, str): - result['href'] = f.url.decode('utf-8', 'ignore') - else: - result['href'] = f.url - result['status'] = 200 - if hasattr(f, 'status'): - result['status'] = f.status - if hasattr(f, 'close'): - f.close() - - if data is None: - return result - - # Stop processing if the server sent HTTP 304 Not Modified. - if getattr(f, 'code', 0) == 304: + # if server sent 304, we're done + if result.get('status', 0) == 304: result['version'] = '' result['debug_message'] = 'The feed has not changed since you last checked, ' + \ 'so the server sent no data. This is a feature, not a bug!' 
return result - data, result['encoding'], error = convert_to_utf8(http_headers, data) - use_strict_parser = result['encoding'] and True or False - if error is not None: + # if there was a problem downloading, we're done + if not data: + return result + + # determine character encoding + use_strict_parser = 0 + known_encoding = 0 + tried_encodings = [] + # try: HTTP encoding, declared XML encoding, encoding sniffed from BOM + for proposed_encoding in (result['encoding'], xml_encoding, sniffed_xml_encoding): + if not proposed_encoding: continue + if proposed_encoding in tried_encodings: continue + tried_encodings.append(proposed_encoding) + try: + data = _toUTF8(data, proposed_encoding) + known_encoding = use_strict_parser = 1 + break + except: + pass + # if no luck and we have auto-detection library, try that + if (not known_encoding) and chardet: + try: + proposed_encoding = chardet.detect(data)['encoding'] + if proposed_encoding and (proposed_encoding not in tried_encodings): + tried_encodings.append(proposed_encoding) + data = _toUTF8(data, proposed_encoding) + known_encoding = use_strict_parser = 1 + except: + pass + # if still no luck and we haven't tried utf-8 yet, try that + if (not known_encoding) and ('utf-8' not in tried_encodings): + try: + proposed_encoding = 'utf-8' + tried_encodings.append(proposed_encoding) + data = _toUTF8(data, proposed_encoding) + known_encoding = use_strict_parser = 1 + except: + pass + # if still no luck and we haven't tried windows-1252 yet, try that + if (not known_encoding) and ('windows-1252' not in tried_encodings): + try: + proposed_encoding = 'windows-1252' + tried_encodings.append(proposed_encoding) + data = _toUTF8(data, proposed_encoding) + known_encoding = use_strict_parser = 1 + except: + pass + # if still no luck, give up + if not known_encoding: result['bozo'] = 1 - result['bozo_exception'] = error - - result['version'], data, entities = replace_doctype(data) - - # Ensure that baseuri is an absolute URI using an 
acceptable URI scheme. - contentloc = http_headers.get('content-location', '') - href = result.get('href', '') - baseuri = _makeSafeAbsoluteURI(href, contentloc) or _makeSafeAbsoluteURI(contentloc) or href - - baselang = http_headers.get('content-language', None) - if not isinstance(baselang, str) and baselang is not None: - baselang = baselang.decode('utf-8', 'ignore') + result['bozo_exception'] = CharacterEncodingUnknown( \ + 'document encoding unknown, I tried ' + \ + '%s, %s, utf-8, and windows-1252 but nothing worked' % \ + (result['encoding'], xml_encoding)) + result['encoding'] = '' + elif proposed_encoding != result['encoding']: + result['bozo'] = 1 + result['bozo_exception'] = CharacterEncodingOverride( \ + 'documented declared as %s, but parsed as %s' % \ + (result['encoding'], proposed_encoding)) + result['encoding'] = proposed_encoding if not _XML_AVAILABLE: use_strict_parser = 0 @@ -3987,26 +2599,260 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer feedparser = _StrictFeedParser(baseuri, baselang, 'utf-8') saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS) saxparser.setFeature(xml.sax.handler.feature_namespaces, 1) - try: - # disable downloading external doctype references, if possible - saxparser.setFeature(xml.sax.handler.feature_external_ges, 0) - except xml.sax.SAXNotSupportedException: - pass saxparser.setContentHandler(feedparser) saxparser.setErrorHandler(feedparser) source = xml.sax.xmlreader.InputSource() source.setByteStream(_StringIO(data)) + if hasattr(saxparser, '_ns_stack'): + # work around bug in built-in SAX parser (doesn't recognize xml: namespace) + # PyXML doesn't have this problem, and it doesn't have _ns_stack either + saxparser._ns_stack.append({'http://www.w3.org/XML/1998/namespace':'xml'}) try: saxparser.parse(source) - except xml.sax.SAXException as e: + except Exception, e: + if _debug: + import traceback + traceback.print_stack() + traceback.print_exc() + sys.stderr.write('xml parsing 
failed\n') result['bozo'] = 1 result['bozo_exception'] = feedparser.exc or e use_strict_parser = 0 - if not use_strict_parser and _SGML_AVAILABLE: - feedparser = _LooseFeedParser(baseuri, baselang, 'utf-8', entities) - feedparser.feed(data.decode('utf-8', 'replace')) + if not use_strict_parser: + feedparser = _LooseFeedParser(baseuri, baselang, known_encoding and 'utf-8' or '') + feedparser.feed(data) result['feed'] = feedparser.feeddata result['entries'] = feedparser.entries result['version'] = result['version'] or feedparser.version result['namespaces'] = feedparser.namespacesInUse return result + +if __name__ == '__main__': + if not sys.argv[1:]: + print __doc__ + sys.exit(0) + else: + urls = sys.argv[1:] + zopeCompatibilityHack() + from pprint import pprint + for url in urls: + print url + print + result = parse(url) + pprint(result) + print + +#REVISION HISTORY +#1.0 - 9/27/2002 - MAP - fixed namespace processing on prefixed RSS 2.0 elements, +# added Simon Fell's test suite +#1.1 - 9/29/2002 - MAP - fixed infinite loop on incomplete CDATA sections +#2.0 - 10/19/2002 +# JD - use inchannel to watch out for image and textinput elements which can +# also contain title, link, and description elements +# JD - check for isPermaLink='false' attribute on guid elements +# JD - replaced openAnything with open_resource supporting ETag and +# If-Modified-Since request headers +# JD - parse now accepts etag, modified, agent, and referrer optional +# arguments +# JD - modified parse to return a dictionary instead of a tuple so that any +# etag or modified information can be returned and cached by the caller +#2.0.1 - 10/21/2002 - MAP - changed parse() so that if we don't get anything +# because of etag/modified, return the old etag/modified to the caller to +# indicate why nothing is being returned +#2.0.2 - 10/21/2002 - JB - added the inchannel to the if statement, otherwise its +# useless. Fixes the problem JD was addressing by adding it. 
+#2.1 - 11/14/2002 - MAP - added gzip support +#2.2 - 1/27/2003 - MAP - added attribute support, admin:generatorAgent. +# start_admingeneratoragent is an example of how to handle elements with +# only attributes, no content. +#2.3 - 6/11/2003 - MAP - added USER_AGENT for default (if caller doesn't specify); +# also, make sure we send the User-Agent even if urllib2 isn't available. +# Match any variation of backend.userland.com/rss namespace. +#2.3.1 - 6/12/2003 - MAP - if item has both link and guid, return both as-is. +#2.4 - 7/9/2003 - MAP - added preliminary Pie/Atom/Echo support based on Sam Ruby's +# snapshot of July 1 ; changed +# project name +#2.5 - 7/25/2003 - MAP - changed to Python license (all contributors agree); +# removed unnecessary urllib code -- urllib2 should always be available anyway; +# return actual url, status, and full HTTP headers (as result['url'], +# result['status'], and result['headers']) if parsing a remote feed over HTTP -- +# this should pass all the HTTP tests at ; +# added the latest namespace-of-the-week for RSS 2.0 +#2.5.1 - 7/26/2003 - RMK - clear opener.addheaders so we only send our custom +# User-Agent (otherwise urllib2 sends two, which confuses some servers) +#2.5.2 - 7/28/2003 - MAP - entity-decode inline xml properly; added support for +# inline and as used in some RSS 2.0 feeds +#2.5.3 - 8/6/2003 - TvdV - patch to track whether we're inside an image or +# textInput, and also to return the character encoding (if specified) +#2.6 - 1/1/2004 - MAP - dc:author support (MarekK); fixed bug tracking +# nested divs within content (JohnD); fixed missing sys import (JohanS); +# fixed regular expression to capture XML character encoding (Andrei); +# added support for Atom 0.3-style links; fixed bug with textInput tracking; +# added support for cloud (MartijnP); added support for multiple +# category/dc:subject (MartijnP); normalize content model: 'description' gets +# description (which can come from description, summary, or full 
content if no +# description), 'content' gets dict of base/language/type/value (which can come +# from content:encoded, xhtml:body, content, or fullitem); +# fixed bug matching arbitrary Userland namespaces; added xml:base and xml:lang +# tracking; fixed bug tracking unknown tags; fixed bug tracking content when +# element is not in default namespace (like Pocketsoap feed); +# resolve relative URLs in link, guid, docs, url, comments, wfw:comment, +# wfw:commentRSS; resolve relative URLs within embedded HTML markup in +# description, xhtml:body, content, content:encoded, title, subtitle, +# summary, info, tagline, and copyright; added support for pingback and +# trackback namespaces +#2.7 - 1/5/2004 - MAP - really added support for trackback and pingback +# namespaces, as opposed to 2.6 when I said I did but didn't really; +# sanitize HTML markup within some elements; added mxTidy support (if +# installed) to tidy HTML markup within some elements; fixed indentation +# bug in _parse_date (FazalM); use socket.setdefaulttimeout if available +# (FazalM); universal date parsing and normalization (FazalM): 'created', modified', +# 'issued' are parsed into 9-tuple date format and stored in 'created_parsed', +# 'modified_parsed', and 'issued_parsed'; 'date' is duplicated in 'modified' +# and vice-versa; 'date_parsed' is duplicated in 'modified_parsed' and vice-versa +#2.7.1 - 1/9/2004 - MAP - fixed bug handling " and '. fixed memory +# leak not closing url opener (JohnD); added dc:publisher support (MarekK); +# added admin:errorReportsTo support (MarekK); Python 2.1 dict support (MarekK) +#2.7.4 - 1/14/2004 - MAP - added workaround for improperly formed
tags in +# encoded HTML (skadz); fixed unicode handling in normalize_attrs (ChrisL); +# fixed relative URI processing for guid (skadz); added ICBM support; added +# base64 support +#2.7.5 - 1/15/2004 - MAP - added workaround for malformed DOCTYPE (seen on many +# blogspot.com sites); added _debug variable +#2.7.6 - 1/16/2004 - MAP - fixed bug with StringIO importing +#3.0b3 - 1/23/2004 - MAP - parse entire feed with real XML parser (if available); +# added several new supported namespaces; fixed bug tracking naked markup in +# description; added support for enclosure; added support for source; re-added +# support for cloud which got dropped somehow; added support for expirationDate +#3.0b4 - 1/26/2004 - MAP - fixed xml:lang inheritance; fixed multiple bugs tracking +# xml:base URI, one for documents that don't define one explicitly and one for +# documents that define an outer and an inner xml:base that goes out of scope +# before the end of the document +#3.0b5 - 1/26/2004 - MAP - fixed bug parsing multiple links at feed level +#3.0b6 - 1/27/2004 - MAP - added feed type and version detection, result['version'] +# will be one of SUPPORTED_VERSIONS.keys() or empty string if unrecognized; +# added support for creativeCommons:license and cc:license; added support for +# full Atom content model in title, tagline, info, copyright, summary; fixed bug +# with gzip encoding (not always telling server we support it when we do) +#3.0b7 - 1/28/2004 - MAP - support Atom-style author element in author_detail +# (dictionary of 'name', 'url', 'email'); map author to author_detail if author +# contains name + email address +#3.0b8 - 1/28/2004 - MAP - added support for contributor +#3.0b9 - 1/29/2004 - MAP - fixed check for presence of dict function; added +# support for summary +#3.0b10 - 1/31/2004 - MAP - incorporated ISO-8601 date parsing routines from +# xml.util.iso8601 +#3.0b11 - 2/2/2004 - MAP - added 'rights' to list of elements that can contain +# dangerous markup; fiddled 
with decodeEntities (not right); liberalized +# date parsing even further +#3.0b12 - 2/6/2004 - MAP - fiddled with decodeEntities (still not right); +# added support to Atom 0.2 subtitle; added support for Atom content model +# in copyright; better sanitizing of dangerous HTML elements with end tags +# (script, frameset) +#3.0b13 - 2/8/2004 - MAP - better handling of empty HTML tags (br, hr, img, +# etc.) in embedded markup, in either HTML or XHTML form (
<br>, <br/>, <br />
) +#3.0b14 - 2/8/2004 - MAP - fixed CDATA handling in non-wellformed feeds under +# Python 2.1 +#3.0b15 - 2/11/2004 - MAP - fixed bug resolving relative links in wfw:commentRSS; +# fixed bug capturing author and contributor URL; fixed bug resolving relative +# links in author and contributor URL; fixed bug resolvin relative links in +# generator URL; added support for recognizing RSS 1.0; passed Simon Fell's +# namespace tests, and included them permanently in the test suite with his +# permission; fixed namespace handling under Python 2.1 +#3.0b16 - 2/12/2004 - MAP - fixed support for RSS 0.90 (broken in b15) +#3.0b17 - 2/13/2004 - MAP - determine character encoding as per RFC 3023 +#3.0b18 - 2/17/2004 - MAP - always map description to summary_detail (Andrei); +# use libxml2 (if available) +#3.0b19 - 3/15/2004 - MAP - fixed bug exploding author information when author +# name was in parentheses; removed ultra-problematic mxTidy support; patch to +# workaround crash in PyXML/expat when encountering invalid entities +# (MarkMoraes); support for textinput/textInput +#3.0b20 - 4/7/2004 - MAP - added CDF support +#3.0b21 - 4/14/2004 - MAP - added Hot RSS support +#3.0b22 - 4/19/2004 - MAP - changed 'channel' to 'feed', 'item' to 'entries' in +# results dict; changed results dict to allow getting values with results.key +# as well as results[key]; work around embedded illformed HTML with half +# a DOCTYPE; work around malformed Content-Type header; if character encoding +# is wrong, try several common ones before falling back to regexes (if this +# works, bozo_exception is set to CharacterEncodingOverride); fixed character +# encoding issues in BaseHTMLProcessor by tracking encoding and converting +# from Unicode to raw strings before feeding data to sgmllib.SGMLParser; +# convert each value in results to Unicode (if possible), even if using +# regex-based parsing +#3.0b23 - 4/21/2004 - MAP - fixed UnicodeDecodeError for feeds that contain +# high-bit characters in 
attributes in embedded HTML in description (thanks +# Thijs van de Vossen); moved guid, date, and date_parsed to mapped keys in +# FeedParserDict; tweaked FeedParserDict.has_key to return True if asking +# about a mapped key +#3.0fc1 - 4/23/2004 - MAP - made results.entries[0].links[0] and +# results.entries[0].enclosures[0] into FeedParserDict; fixed typo that could +# cause the same encoding to be tried twice (even if it failed the first time); +# fixed DOCTYPE stripping when DOCTYPE contained entity declarations; +# better textinput and image tracking in illformed RSS 1.0 feeds +#3.0fc2 - 5/10/2004 - MAP - added and passed Sam's amp tests; added and passed +# my blink tag tests +#3.0fc3 - 6/18/2004 - MAP - fixed bug in _changeEncodingDeclaration that +# failed to parse utf-16 encoded feeds; made source into a FeedParserDict; +# duplicate admin:generatorAgent/@rdf:resource in generator_detail.url; +# added support for image; refactored parse() fallback logic to try other +# encodings if SAX parsing fails (previously it would only try other encodings +# if re-encoding failed); remove unichr madness in normalize_attrs now that +# we're properly tracking encoding in and out of BaseHTMLProcessor; set +# feed.language from root-level xml:lang; set entry.id from rdf:about; +# send Accept header +#3.0 - 6/21/2004 - MAP - don't try iso-8859-1 (can't distinguish between +# iso-8859-1 and windows-1252 anyway, and most incorrectly marked feeds are +# windows-1252); fixed regression that could cause the same encoding to be +# tried twice (even if it failed the first time) +#3.0.1 - 6/22/2004 - MAP - default to us-ascii for all text/* content types; +# recover from malformed content-type header parameter with no equals sign +# ('text/xml; charset:iso-8859-1') +#3.1 - 6/28/2004 - MAP - added and passed tests for converting HTML entities +# to Unicode equivalents in illformed feeds (aaronsw); added and +# passed tests for converting character entities to Unicode equivalents +# 
in illformed feeds (aaronsw); test for valid parsers when setting +# XML_AVAILABLE; make version and encoding available when server returns +# a 304; add handlers parameter to pass arbitrary urllib2 handlers (like +# digest auth or proxy support); add code to parse username/password +# out of url and send as basic authentication; expose downloading-related +# exceptions in bozo_exception (aaronsw); added __contains__ method to +# FeedParserDict (aaronsw); added publisher_detail (aaronsw) +#3.2 - 7/3/2004 - MAP - use cjkcodecs and iconv_codec if available; always +# convert feed to UTF-8 before passing to XML parser; completely revamped +# logic for determining character encoding and attempting XML parsing +# (much faster); increased default timeout to 20 seconds; test for presence +# of Location header on redirects; added tests for many alternate character +# encodings; support various EBCDIC encodings; support UTF-16BE and +# UTF16-LE with or without a BOM; support UTF-8 with a BOM; support +# UTF-32BE and UTF-32LE with or without a BOM; fixed crashing bug if no +# XML parsers are available; added support for 'Content-encoding: deflate'; +# send blank 'Accept-encoding: ' header if neither gzip nor zlib modules +# are available +#3.3 - 7/15/2004 - MAP - optimize EBCDIC to ASCII conversion; fix obscure +# problem tracking xml:base and xml:lang if element declares it, child +# doesn't, first grandchild redeclares it, and second grandchild doesn't; +# refactored date parsing; defined public registerDateHandler so callers +# can add support for additional date formats at runtime; added support +# for OnBlog, Nate, MSSQL, Greek, and Hungarian dates (ytrewq1); added +# zopeCompatibilityHack() which turns FeedParserDict into a regular +# dictionary, required for Zope compatibility, and also makes command- +# line debugging easier because pprint module formats real dictionaries +# better than dictionary-like objects; added NonXMLContentType exception, +# which is stored in 
bozo_exception when a feed is served with a non-XML +# media type such as 'text/plain'; respect Content-Language as default +# language if not xml:lang is present; cloud dict is now FeedParserDict; +# generator dict is now FeedParserDict; better tracking of xml:lang, +# including support for xml:lang='' to unset the current language; +# recognize RSS 1.0 feeds even when RSS 1.0 namespace is not the default +# namespace; don't overwrite final status on redirects (scenarios: +# redirecting to a URL that returns 304, redirecting to a URL that +# redirects to another URL with a different type of redirect); add +# support for HTTP 303 redirects +#4.0 - MAP - support for relative URIs in xml:base attribute; fixed +# encoding issue with mxTidy (phopkins); preliminary support for RFC 3229; +# support for Atom 1.0; support for iTunes extensions; new 'tags' for +# categories/keywords/etc. as array of dict +# {'term': term, 'scheme': scheme, 'label': label} to match Atom 1.0 +# terminology; parse RFC 822-style dates with no time; lots of other +# bug fixes +#4.1 - MAP - removed socket timeout; added support for chardet library diff --git a/libs/magic.py b/libs/magic.py index 10685ac..6c30a0c 100644 --- a/libs/magic.py +++ b/libs/magic.py @@ -1,4 +1,7 @@ """ +Adam Hupp (adam@hupp.org) +http://github.com/ahupp/python-magic + magic is a wrapper around the libmagic file identification library. See README for more information. @@ -17,12 +20,9 @@ Usage: """ -import sys -import glob import os.path import ctypes import ctypes.util -import threading from ctypes import c_char_p, c_int, c_size_t, c_void_p @@ -34,112 +34,74 @@ class Magic: """ - def __init__(self, mime=False, magic_file=None, mime_encoding=False, - keep_going=False): + def __init__(self, mime=False, magic_file=None, mime_encoding=False): """ Create a new libmagic wrapper. 
mime - if True, mimetypes are returned instead of textual descriptions mime_encoding - if True, codec is returned magic_file - use a mime database other than the system default - keep_going - don't stop at the first match, keep going - """ - self.flags = MAGIC_NONE - if mime: - self.flags |= MAGIC_MIME - elif mime_encoding: - self.flags |= MAGIC_MIME_ENCODING - if keep_going: - self.flags |= MAGIC_CONTINUE - self.cookie = magic_open(self.flags) + """ + flags = MAGIC_NONE + if mime: + flags |= MAGIC_MIME + elif mime_encoding: + flags |= MAGIC_MIME_ENCODING + + self.cookie = magic_open(flags) magic_load(self.cookie, magic_file) - self.thread = threading.currentThread() def from_buffer(self, buf): """ Identify the contents of `buf` """ - self._thread_check() - try: - return magic_buffer(self.cookie, buf) - except MagicException as e: - return self._handle509Bug(e) + return magic_buffer(self.cookie, buf) def from_file(self, filename): """ Identify the contents of file `filename` raises IOError if the file does not exist """ - self._thread_check() + if not os.path.exists(filename): raise IOError("File does not exist: " + filename) - try: - return magic_file(self.cookie, filename) - except MagicException as e: - return self._handle509Bug(e) - def _handle509Bug(self, e): - # libmagic 5.09 has a bug where it might mail to identify the - # mimetype of a file and returns null from magic_file (and - # likely _buffer), but also does not return an error message. - if e.message is None and (self.flags & MAGIC_MIME): - return "application/octet-stream" - - def _thread_check(self): - if self.thread != threading.currentThread(): - raise Exception('attempting to use libmagic on multiple threads will ' - 'end in SEGV. 
Prefer to use the module functions ' - 'from_file or from_buffer, or carefully manage direct ' - 'use of the Magic class') + return magic_file(self.cookie, filename) def __del__(self): - # no _thread_check here because there can be no other - # references to this object at this point. - - # during shutdown magic_close may have been cleared already so - # make sure it exists before using it. - - # the self.cookie check should be unnessary and was an - # incorrect fix for a threading problem, however I'm leaving - # it in because it's harmless and I'm slightly afraid to - # remove it. - if self.cookie and magic_close: + if self.cookie: magic_close(self.cookie) self.cookie = None +_magic_mime = None +_magic = None -instances = threading.local() +def _get_magic_mime(): + global _magic_mime + if not _magic_mime: + _magic_mime = Magic(mime=True) + return _magic_mime + +def _get_magic(): + global _magic + if not _magic: + _magic = Magic() + return _magic def _get_magic_type(mime): - i = instances.__dict__.get(mime) - if i is None: - i = instances.__dict__[mime] = Magic(mime=mime) - return i + if mime: + return _get_magic_mime() + else: + return _get_magic() def from_file(filename, mime=False): - """" - Accepts a filename and returns the detected filetype. Return - value is the mimetype if mime=True, otherwise a human readable - name. - - >>> magic.from_file("testdata/test.pdf", mime=True) - 'application/pdf' - """ m = _get_magic_type(mime) return m.from_file(filename) def from_buffer(buffer, mime=False): - """ - Accepts a binary string and returns the detected filetype. Return - value is the mimetype if mime=True, otherwise a human readable - name. 
- - >>> magic.from_buffer(open("testdata/test.pdf").read(1024)) - 'PDF document, version 1.2' - """ m = _get_magic_type(mime) return m.from_buffer(buffer) @@ -148,22 +110,19 @@ def from_buffer(buffer, mime=False): libmagic = None # Let's try to find magic or magic1 -dll = ctypes.util.find_library('magic') or ctypes.util.find_library('magic1') or ctypes.util.find_library('cygmagic-1') +dll = ctypes.util.find_library('magic') or ctypes.util.find_library('magic1') # This is necessary because find_library returns None if it doesn't find the library if dll: libmagic = ctypes.CDLL(dll) if not libmagic or not libmagic._name: - platform_to_lib = {'darwin': ['/opt/local/lib/libmagic.dylib', - '/usr/local/lib/libmagic.dylib'] + - # Assumes there will only be one version installed - glob.glob('/usr/local/Cellar/libmagic/*/lib/libmagic.dylib'), - 'win32': ['magic1.dll','cygmagic-1.dll']} - for dll in platform_to_lib.get(sys.platform, []): + import sys + platform_to_lib = {'darwin': '/opt/local/lib/libmagic.dylib', + 'win32': 'magic1.dll'} + if sys.platform in platform_to_lib: try: - libmagic = ctypes.CDLL(dll) - break + libmagic = ctypes.CDLL(platform_to_lib[sys.platform]) except OSError: pass @@ -173,38 +132,13 @@ if not libmagic or not libmagic._name: magic_t = ctypes.c_void_p -def errorcheck_null(result, func, args): - if result is None: - err = magic_error(args[0]) +def errorcheck(result, func, args): + err = magic_error(args[0]) + if err is not None: raise MagicException(err) else: return result -def errorcheck_negative_one(result, func, args): - if result is -1: - err = magic_error(args[0]) - raise MagicException(err) - else: - return result - - -def coerce_filename(filename): - if filename is None: - return None - - # ctypes will implicitly convert unicode strings to bytes with - # .encode('ascii'). 
If you use the filesystem encoding - # then you'll get inconsistent behavior (crashes) depending on the user's - # LANG environment variable - is_unicode = (sys.version_info[0] <= 2 and - isinstance(filename, unicode)) or \ - (sys.version_info[0] >= 3 and - isinstance(filename, str)) - if is_unicode: - return filename.encode('utf-8') - else: - return filename - magic_open = libmagic.magic_open magic_open.restype = magic_t magic_open.argtypes = [c_int] @@ -221,30 +155,26 @@ magic_errno = libmagic.magic_errno magic_errno.restype = c_int magic_errno.argtypes = [magic_t] -_magic_file = libmagic.magic_file -_magic_file.restype = c_char_p -_magic_file.argtypes = [magic_t, c_char_p] -_magic_file.errcheck = errorcheck_null +magic_file = libmagic.magic_file +magic_file.restype = c_char_p +magic_file.argtypes = [magic_t, c_char_p] +magic_file.errcheck = errorcheck -def magic_file(cookie, filename): - return _magic_file(cookie, coerce_filename(filename)) _magic_buffer = libmagic.magic_buffer _magic_buffer.restype = c_char_p _magic_buffer.argtypes = [magic_t, c_void_p, c_size_t] -_magic_buffer.errcheck = errorcheck_null +_magic_buffer.errcheck = errorcheck + def magic_buffer(cookie, buf): return _magic_buffer(cookie, buf, len(buf)) -_magic_load = libmagic.magic_load -_magic_load.restype = c_int -_magic_load.argtypes = [magic_t, c_char_p] -_magic_load.errcheck = errorcheck_negative_one - -def magic_load(cookie, filename): - return _magic_load(cookie, coerce_filename(filename)) +magic_load = libmagic.magic_load +magic_load.restype = c_int +magic_load.argtypes = [magic_t, c_char_p] +magic_load.errcheck = errorcheck magic_setflags = libmagic.magic_setflags magic_setflags.restype = c_int diff --git a/libs/pytwmn.py b/libs/pytwmn.py index 6b2d774..49661fb 100644 --- a/libs/pytwmn.py +++ b/libs/pytwmn.py @@ -45,8 +45,8 @@ def init(host="127.0.0.1", port=None): class Notification(object): def __init__(self, title="", msg="", icon=""): - self.title = str(title) - self.msg = str(msg) + 
self.title = unicode(title) + self.msg = unicode(msg) if icon.startswith("file://"): icon = icon[7:] self.icon = icon diff --git a/libs/sgmllib.py b/libs/sgmllib.py deleted file mode 100644 index 88a02a3..0000000 --- a/libs/sgmllib.py +++ /dev/null @@ -1,547 +0,0 @@ -"""A parser for SGML, using the derived class as a static DTD.""" - -# XXX This only supports those SGML features used by HTML. - -# XXX There should be a way to distinguish between PCDATA (parsed -# character data -- the normal case), RCDATA (replaceable character -# data -- only char and entity references and end tags are special) -# and CDATA (character data -- only end tags are special). RCDATA is -# not supported at all. - -import _markupbase -import re - -__all__ = ["SGMLParser", "SGMLParseError"] - -# Regular expressions used for parsing - -interesting = re.compile('[&<]') -incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|' - '<([a-zA-Z][^<>]*|' - '/([a-zA-Z][^<>]*)?|' - '![^<>]*)?') - -entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]') -charref = re.compile('&#([0-9]+)[^0-9]') - -starttagopen = re.compile('<[>a-zA-Z]') -shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/') -shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/') -piclose = re.compile('>') -endbracket = re.compile('[<>]') -tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*') -attrfind = re.compile( - r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*' - r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?') - - -class SGMLParseError(RuntimeError): - """Exception raised for all parse errors.""" - pass - - -# SGML parser base class -- find tags and call handler functions. -# Usage: p = SGMLParser(); p.feed(data); ...; p.close(). -# The dtd is defined by deriving a class which defines methods -# with special names to handle tags: start_foo and end_foo to handle -# and , respectively, or do_foo to handle by itself. -# (Tags are converted to lower case for this purpose.) 
The data -# between tags is passed to the parser by calling self.handle_data() -# with some data as argument (the data may be split up in arbitrary -# chunks). Entity references are passed by calling -# self.handle_entityref() with the entity reference as argument. - -class SGMLParser(_markupbase.ParserBase): - # Definition of entities -- derived classes may override - entity_or_charref = re.compile('&(?:' - '([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)' - ')(;?)') - - def __init__(self, verbose=0): - """Initialize and reset this instance.""" - self.verbose = verbose - self.reset() - - def reset(self): - """Reset this instance. Loses all unprocessed data.""" - self.__starttag_text = None - self.rawdata = '' - self.stack = [] - self.lasttag = '???' - self.nomoretags = 0 - self.literal = 0 - _markupbase.ParserBase.reset(self) - - def setnomoretags(self): - """Enter literal mode (CDATA) till EOF. - - Intended for derived classes only. - """ - self.nomoretags = self.literal = 1 - - def setliteral(self, *args): - """Enter literal mode (CDATA). - - Intended for derived classes only. - """ - self.literal = 1 - - def feed(self, data): - """Feed some data to the parser. - - Call this as often as you want, with as little or as much text - as you want (may include '\n'). (This just saves the text, - all the processing is done by goahead().) - """ - - self.rawdata = self.rawdata + data - self.goahead(0) - - def close(self): - """Handle the remaining data.""" - self.goahead(1) - - def error(self, message): - raise SGMLParseError(message) - - # Internal -- handle data as far as reasonable. May leave state - # and data to be processed by a subsequent call. If 'end' is - # true, force handling all data as if followed by EOF marker. 
- def goahead(self, end): - rawdata = self.rawdata - i = 0 - n = len(rawdata) - while i < n: - if self.nomoretags: - self.handle_data(rawdata[i:n]) - i = n - break - match = interesting.search(rawdata, i) - if match: j = match.start() - else: j = n - if i < j: - self.handle_data(rawdata[i:j]) - i = j - if i == n: break - if rawdata[i] == '<': - if starttagopen.match(rawdata, i): - if self.literal: - self.handle_data(rawdata[i]) - i = i+1 - continue - k = self.parse_starttag(i) - if k < 0: break - i = k - continue - if rawdata.startswith(" (i + 1): - self.handle_data("<") - i = i+1 - else: - # incomplete - break - continue - if rawdata.startswith(" send "%s"' % msg) try: self.socket.send(msg + bytes("\r\n", "ascii")) - except socket.error as se: + except socket.error, se: try: # a little dance of compatibility to get the errno errno = se.errno except AttributeError: @@ -161,12 +160,12 @@ class IRCClient: while not self._end: try: buffer += self.socket.recv(1024) - except socket.timeout as e: + except socket.timeout, e: if self._end: break logging.debug("timeout in client.py") raise e - except socket.error as e: + except socket.error, e: if self._end: break logging.debug("error %s" % e) @@ -196,16 +195,16 @@ class IRCClient: pass yield True - except socket.timeout as se: + except socket.timeout, se: logging.debug("passing timeout") raise se - except socket.error as se: + except socket.error, se: logging.debug("problem: %s" % (se)) if self.socket: logging.info('error: closing socket') self.socket.close() raise se - except Exception as e: + except Exception, e: logging.debug("other exception: %s" % e) raise e else: @@ -254,7 +253,7 @@ class IRCApp: garuntee the callback will be called after seconds has passed. 
( the only advantage to these timers is they dont use threads ) """ - assert isinstance(cb, collections.Callable) + assert callable(cb) logging.info('added timer to call %s in %ss' % (cb, seconds)) self._timers.append((time.time() + seconds, cb)) @@ -265,13 +264,13 @@ class IRCApp: while self.running: found_one_alive = False - for client, clientdesc in self._clients.items(): + for client, clientdesc in self._clients.iteritems(): if clientdesc.con is None: clientdesc.con = client.connect() try: - next(clientdesc.con) - except Exception as e: + clientdesc.con.next() + except Exception, e: logging.error('client error %s' % e) logging.error(traceback.format_exc()) if clientdesc.autoreconnect: diff --git a/oyoyo/cmdhandler.py b/oyoyo/cmdhandler.py index a7a8a86..778020e 100644 --- a/oyoyo/cmdhandler.py +++ b/oyoyo/cmdhandler.py @@ -65,17 +65,13 @@ class CommandHandler(object): its possible to pass both "command.sub.func" and ["command", "sub", "func"]. """ - if isinstance(in_command_parts, (bytes)): + if isinstance(in_command_parts, (str, bytes)): in_command_parts = in_command_parts.split(bytes('.', 'ascii')) - elif isinstance(in_command_parts, (str)): - in_command_parts = in_command_parts.split('.') command_parts = in_command_parts[:] p = self while command_parts: - cmd = command_parts.pop(0) - if type(cmd) is bytes: - cmd = cmd.decode('utf-8') + cmd = command_parts.pop(0).decode('ascii') if cmd.startswith('_'): raise ProtectedCommandError(in_command_parts) @@ -109,7 +105,7 @@ class CommandHandler(object): try: f(*args) - except Exception as e: + except Exception, e: logging.error('command raised %s' % e) logging.error(traceback.format_exc()) raise CommandError(command) @@ -155,7 +151,7 @@ class DefaultBotCommandHandler(CommandHandler): else: try: f = self.get(arg) - except CommandError as e: + except CommandError, e: helpers.msg(self.client, dest, str(e)) return @@ -202,7 +198,7 @@ class BotCommandHandler(DefaultCommandHandler): try: self.command_handler.run(command, 
prefix, dest, *arg) - except CommandError as e: + except CommandError, e: helpers.msg(self.client, dest, str(e)) return True diff --git a/oyoyo/examplebot.py b/oyoyo/examplebot.py index dfd1885..81aac02 100644 --- a/oyoyo/examplebot.py +++ b/oyoyo/examplebot.py @@ -21,7 +21,7 @@ class MyHandler(DefaultCommandHandler): match = re.match('\!say (.*)', msg) if match: to_say = match.group(1).strip() - print(('Saying, "%s"' % to_say)) + print('Saying, "%s"' % to_say) helpers.msg(self.client, chan, to_say) @@ -37,7 +37,7 @@ def main(): conn = cli.connect() while True: - next(conn) ## python 2 + conn.next() ## python 2 # next(conn) ## python 3 diff --git a/oyoyo/helpers.py b/oyoyo/helpers.py index 5c25b59..c82ec9c 100644 --- a/oyoyo/helpers.py +++ b/oyoyo/helpers.py @@ -111,7 +111,7 @@ def _addNumerics(): cli.send(cmd_num, *args) return f m = sys.modules[__name__] - for num, name in ircevents.numeric_events.items(): + for num, name in ircevents.numeric_events.iteritems(): setattr(m, name, numericcmd(num, name)) _addNumerics() diff --git a/oyoyo/ircevents.py b/oyoyo/ircevents.py index a1bda3c..6d8969b 100644 --- a/oyoyo/ircevents.py +++ b/oyoyo/ircevents.py @@ -179,8 +179,6 @@ numeric_events = { "502": "usersdontmatch", } -numeric_events = {bytes(k, 'ascii'):v for k, v in numeric_events.items()} - generated_events = [ # Generated events "dcc_connect", @@ -208,5 +206,5 @@ protocol_events = [ "pong", ] -all_events = generated_events + protocol_events + list(numeric_events.values()) +all_events = generated_events + protocol_events + numeric_events.values() diff --git a/oyoyo/services.py b/oyoyo/services.py index 751a787..9183beb 100644 --- a/oyoyo/services.py +++ b/oyoyo/services.py @@ -1,5 +1,5 @@ import sys -from .helpers import msg +from helpers import msg # NickServ basic functions _nickservfuncs = ( @@ -103,7 +103,7 @@ def _addServ(serv, funcs, prefix=""): if prefix: cmd_name = prefix.upper() + " " + cmd_name def f(cli, *args): - print(cmd_name, " ".join(args)) + print 
cmd_name, " ".join(args) #cli.send(cmd_name, serv.name, *args) return f for t in funcs: diff --git a/parsetools.py b/parsetools.py index c834020..4abceed 100644 --- a/parsetools.py +++ b/parsetools.py @@ -29,7 +29,7 @@ quirkloader = ScriptQuirks() quirkloader.add(PythonQuirks()) quirkloader.add(LuaQuirks()) quirkloader.loadAll() -print(quirkloader.funcre()) +print quirkloader.funcre() _functionre = re.compile(r"%s" % quirkloader.funcre()) _groupre = re.compile(r"\\([0-9]+)") @@ -44,7 +44,7 @@ def lexer(string, objlist): for (oType, regexp) in objlist: newstringlist = [] for (stri, s) in enumerate(stringlist): - if type(s) not in [str]: + if type(s) not in [str, unicode]: newstringlist.append(s) continue lasti = 0 @@ -207,9 +207,9 @@ def lexMessage(string): (smiley, _smilere), (honker, _honk)] - string = str(string) + string = unicode(string) string = string.replace("\n", " ").replace("\r", " ") - lexed = lexer(str(string), lexlist) + lexed = lexer(unicode(string), lexlist) balanced = [] beginc = 0 @@ -231,7 +231,7 @@ def lexMessage(string): balanced.append(colorEnd("")) if len(balanced) == 0: balanced.append("") - if type(balanced[len(balanced)-1]) not in [str]: + if type(balanced[len(balanced)-1]) not in [str, unicode]: balanced.append("") return balanced @@ -239,12 +239,12 @@ def convertTags(lexed, format="html"): if format not in ["html", "bbcode", "ctag", "text"]: raise ValueError("Color format not recognized") - if type(lexed) in [str]: + if type(lexed) in [str, unicode]: lexed = lexMessage(lexed) escaped = "" firststr = True for (i, o) in enumerate(lexed): - if type(o) in [str]: + if type(o) in [str, unicode]: if format == "html": escaped += o.replace("&", "&").replace(">", ">").replace("<","<") else: @@ -259,7 +259,7 @@ def splitMessage(msg, format="ctag"): # split long text lines buf = [] for o in msg: - if type(o) in [str] and len(o) > 200: + if type(o) in [str, unicode] and len(o) > 200: for i in range(0, len(o), 200): buf.append(o[i:i+200]) else: @@ 
-401,7 +401,7 @@ def parseRegexpFunctions(to): backr = _groupre.search(mo.group()) if backr is not None: current.append(backreference(backr.group(1))) - elif mo.group()[:-1] in list(functiondict.keys()): + elif mo.group()[:-1] in functiondict.keys(): p = parseLeaf(functiondict[mo.group()[:-1]], current) current.append(p) current = p @@ -418,7 +418,7 @@ def parseRegexpFunctions(to): def img2smiley(string): - string = str(string) + string = unicode(string) def imagerep(mo): return reverse_smiley[mo.group(1)] string = re.sub(r'', imagerep, string) @@ -499,8 +499,8 @@ if ostools.isOSXBundle(): -reverse_smiley = dict((v,k) for k, v in smiledict.items()) -_smilere = re.compile("|".join(list(smiledict.keys()))) +reverse_smiley = dict((v,k) for k, v in smiledict.iteritems()) +_smilere = re.compile("|".join(smiledict.keys())) class ThemeException(Exception): def __init__(self, value): diff --git a/pesterchum.py b/pesterchum.py index 115a66f..7bff675 100644 --- a/pesterchum.py +++ b/pesterchum.py @@ -9,22 +9,28 @@ from datetime import * import random import re from time import time -import threading, queue +import threading, Queue reqmissing = [] optmissing = [] try: - from PyQt5 import QtGui, QtCore, QtWidgets, QtMultimedia -except ImportError as e: + from PyQt5 import QtGui, QtCore, QtWidgets +except ImportError, e: module = str(e) if module.startswith("No module named ") or \ module.startswith("cannot import name "): reqmissing.append(module[module.rfind(" ")+1:]) - else: print(e) - + else: print e +try: + import pygame +except ImportError, e: + pygame = None + module = str(e) + if module[:16] == "No module named ": optmissing.append(module[16:]) + else: print e if reqmissing: - print("ERROR: The following modules are required for Pesterchum to run and are missing on your system:") - for m in reqmissing: print("* "+m) + print "ERROR: The following modules are required for Pesterchum to run and are missing on your system:" + for m in reqmissing: print "* "+m exit() vnum = 
QtCore.qVersion() major = int(vnum[:vnum.find(".")]) @@ -33,8 +39,8 @@ if vnum.find(".", vnum.find(".")+1) != -1: else: minor = int(vnum[vnum.find(".")+1:]) if not ((major > 4) or (major == 4 and minor >= 6)): - print("ERROR: Pesterchum requires Qt version >= 4.6") - print("You currently have version " + vnum + ". Please upgrade Qt") + print "ERROR: Pesterchum requires Qt version >= 4.6" + print "You currently have version " + vnum + ". Please upgrade Qt" exit() import ostools @@ -107,7 +113,7 @@ class waitingMessageHolder(object): def __init__(self, mainwindow, **msgfuncs): self.mainwindow = mainwindow self.funcs = msgfuncs - self.queue = list(msgfuncs.keys()) + self.queue = msgfuncs.keys() if len(self.queue) > 0: self.mainwindow.updateSystemTray() def waitingHandles(self): @@ -123,7 +129,7 @@ class waitingMessageHolder(object): if len(self.queue) == 0: self.mainwindow.updateSystemTray() def addMessage(self, handle, func): - if handle not in self.funcs: + if not self.funcs.has_key(handle): self.queue.append(handle) self.funcs[handle] = func if len(self.queue) > 0: @@ -276,13 +282,13 @@ class chumArea(RightClickTree): @QtCore.pyqtSlot() def beginNotify(self): - print("BEGIN NOTIFY") + print "BEGIN NOTIFY" self.notify = True def getOptionsMenu(self): if not self.currentItem(): return None - text = str(self.currentItem().text(0)) + text = unicode(self.currentItem().text(0)) if text.rfind(" (") != -1: text = text[0:text.rfind(" (")] if text == "Chums": @@ -328,13 +334,13 @@ class chumArea(RightClickTree): if thisitem.rfind(" (") != -1: thisitem = thisitem[0:thisitem.rfind(" (")] # Drop item is a group - thisitem = str(event.source().currentItem().text(0)) + thisitem = unicode(event.source().currentItem().text(0)) if thisitem.rfind(" (") != -1: thisitem = thisitem[0:thisitem.rfind(" (")] if thisitem == "Chums" or thisitem in self.groups: droppos = self.itemAt(event.pos()) if not droppos: return - droppos = str(droppos.text(0)) + droppos = unicode(droppos.text(0)) if 
droppos.rfind(" ") != -1: droppos = droppos[0:droppos.rfind(" ")] if droppos == "Chums" or droppos in self.groups: @@ -347,16 +353,16 @@ class chumArea(RightClickTree): gTemp = [] for i in range(self.topLevelItemCount()): - text = str(self.topLevelItem(i).text(0)) + text = unicode(self.topLevelItem(i).text(0)) if text.rfind(" (") != -1: text = text[0:text.rfind(" (")] - gTemp.append([str(text), self.topLevelItem(i).isExpanded()]) + gTemp.append([unicode(text), self.topLevelItem(i).isExpanded()]) self.mainwindow.config.saveGroups(gTemp) # Drop item is a chum else: item = self.itemAt(event.pos()) if item: - text = str(item.text(0)) + text = unicode(item.text(0)) # Figure out which group to drop into if text.rfind(" (") != -1: text = text[0:text.rfind(" (")] @@ -364,7 +370,7 @@ class chumArea(RightClickTree): group = text gitem = item else: - ptext = str(item.parent().text(0)) + ptext = unicode(item.parent().text(0)) if ptext.rfind(" ") != -1: ptext = ptext[0:ptext.rfind(" ")] group = ptext @@ -387,7 +393,7 @@ class chumArea(RightClickTree): if chums.index(thisitem) < inPos: inPos -= 1 chums.remove(thisitem) - chums.insert(inPos, str(thisitem)) + chums.insert(inPos, unicode(thisitem)) self.mainwindow.config.setChums(chums) else: @@ -399,9 +405,9 @@ class chumArea(RightClickTree): currentGroup = self.currentItem() if currentGroup: if currentGroup.parent(): - text = str(currentGroup.parent().text(0)) + text = unicode(currentGroup.parent().text(0)) else: - text = str(currentGroup.text(0)) + text = unicode(currentGroup.text(0)) if text.rfind(" (") != -1: text = text[0:text.rfind(" (")] currentGroup = text @@ -459,7 +465,7 @@ class chumArea(RightClickTree): return curgroups = [] for i in range(self.topLevelItemCount()): - text = str(self.topLevelItem(i).text(0)) + text = unicode(self.topLevelItem(i).text(0)) if text.rfind(" (") != -1: text = text[0:text.rfind(" (")] curgroups.append(text) @@ -483,31 +489,31 @@ class chumArea(RightClickTree): totals = {'Chums': 0} online = 
{'Chums': 0} for g in self.groups: - totals[str(g)] = 0 - online[str(g)] = 0 + totals[unicode(g)] = 0 + online[unicode(g)] = 0 for c in self.chums: yes = c.mood.name() != "offline" if c.group == "Chums": - totals[str(c.group)] = totals[str(c.group)]+1 + totals[unicode(c.group)] = totals[unicode(c.group)]+1 if yes: - online[str(c.group)] = online[str(c.group)]+1 + online[unicode(c.group)] = online[unicode(c.group)]+1 elif c.group in totals: - totals[str(c.group)] = totals[str(c.group)]+1 + totals[unicode(c.group)] = totals[unicode(c.group)]+1 if yes: - online[str(c.group)] = online[str(c.group)]+1 + online[unicode(c.group)] = online[unicode(c.group)]+1 else: totals["Chums"] = totals["Chums"]+1 if yes: online["Chums"] = online["Chums"]+1 for i in range(self.topLevelItemCount()): - text = str(self.topLevelItem(i).text(0)) + text = unicode(self.topLevelItem(i).text(0)) if text.rfind(" (") != -1: text = text[0:text.rfind(" (")] if text in online: self.topLevelItem(i).setText(0, "%s (%i/%i)" % (text, online[text], totals[text])) def hideOnlineNumbers(self): for i in range(self.topLevelItemCount()): - text = str(self.topLevelItem(i).text(0)) + text = unicode(self.topLevelItem(i).text(0)) if text.rfind(" (") != -1: text = text[0:text.rfind(" (")] self.topLevelItem(i).setText(0, "%s" % (text)) @@ -523,7 +529,7 @@ class chumArea(RightClickTree): @QtCore.pyqtSlot() def expandGroup(self): item = self.currentItem() - text = str(item.text(0)) + text = unicode(item.text(0)) if text.rfind(" (") != -1: text = text[0:text.rfind(" (")] @@ -538,7 +544,7 @@ class chumArea(RightClickTree): self.mainwindow.config.addGroup("Chums") curgroups = [] for i in range(self.topLevelItemCount()): - text = str(self.topLevelItem(i).text(0)) + text = unicode(self.topLevelItem(i).text(0)) if text.rfind(" (") != -1: text = text[0:text.rfind(" (")] curgroups.append(text) @@ -555,7 +561,7 @@ class chumArea(RightClickTree): if self.openGroups[self.groups.index("%s" % (chumLabel.chum.group))]: 
child_1.setExpanded(True) for i in range(self.topLevelItemCount()): - text = str(self.topLevelItem(i).text(0)) + text = unicode(self.topLevelItem(i).text(0)) if text.rfind(" (") != -1: text = text[0:text.rfind(" (")] if text == chumLabel.chum.group: @@ -574,7 +580,7 @@ class chumArea(RightClickTree): bestname = "" if fi > 0: while not bestj: - for j in range(self.topLevelItem(i).childCount()): + for j in xrange(self.topLevelItem(i).childCount()): if chums[fi-c] == str(self.topLevelItem(i).child(j).text(0)): bestj = j bestname = chums[fi-c] @@ -649,7 +655,7 @@ class chumArea(RightClickTree): def initTheme(self, theme): self.resize(*theme["main/chums/size"]) self.move(*theme["main/chums/loc"]) - if "main/chums/scrollbar" in theme: + if theme.has_key("main/chums/scrollbar"): self.setStyleSheet("QListWidget { %s } QScrollBar { %s } QScrollBar::handle { %s } QScrollBar::add-line { %s } QScrollBar::sub-line { %s } QScrollBar:up-arrow { %s } QScrollBar:down-arrow { %s }" % (theme["main/chums/style"], theme["main/chums/scrollbar/style"], theme["main/chums/scrollbar/handle"], theme["main/chums/scrollbar/downarrow"], theme["main/chums/scrollbar/uparrow"], theme["main/chums/scrollbar/uarrowstyle"], theme["main/chums/scrollbar/darrowstyle"] )) else: self.setStyleSheet(theme["main/chums/style"]) @@ -757,7 +763,7 @@ class chumArea(RightClickTree): return (notes, ok) = QtWidgets.QInputDialog.getText(self, "Notes", "Enter your notes...") if ok: - notes = str(notes) + notes = unicode(notes) self.mainwindow.chumdb.setNotes(currentChum.handle, notes) currentChum.setToolTip(0, "%s: %s" % (currentChum.handle, notes)) @QtCore.pyqtSlot() @@ -767,7 +773,7 @@ class chumArea(RightClickTree): if not self.renamegroupdialog: (gname, ok) = QtWidgets.QInputDialog.getText(self, "Rename Group", "Enter a new name for the group:") if ok: - gname = str(gname) + gname = unicode(gname) if re.search("[^A-Za-z0-9_\s]", gname) is not None: msgbox = QtWidgets.QMessageBox() msgbox.setInformativeText("THIS 
IS NOT A VALID GROUP NAME") @@ -781,7 +787,7 @@ class chumArea(RightClickTree): index = self.indexOfTopLevelItem(currentGroup) if index != -1: expanded = currentGroup.isExpanded() - text = str(currentGroup.text(0)) + text = unicode(currentGroup.text(0)) if text.rfind(" (") != -1: text = text[0:text.rfind(" (")] self.mainwindow.config.delGroup(text) @@ -801,7 +807,7 @@ class chumArea(RightClickTree): currentGroup = self.currentItem() if not currentGroup: return - text = str(currentGroup.text(0)) + text = unicode(currentGroup.text(0)) if text.rfind(" (") != -1: text = text[0:text.rfind(" (")] self.mainwindow.config.delGroup(text) @@ -824,7 +830,7 @@ class chumArea(RightClickTree): def moveToGroup(self, item): if not item: return - group = str(item.text()) + group = unicode(item.text()) chumLabel = self.currentItem() if not chumLabel: return @@ -939,7 +945,7 @@ class TrollSlumWindow(QtWidgets.QFrame): self.addtrolldialog = QtWidgets.QInputDialog(self) (handle, ok) = self.addtrolldialog.getText(self, "Add Troll", "Enter Troll Handle:") if ok: - handle = str(handle) + handle = unicode(handle) if not (PesterProfile.checkLength(handle) and PesterProfile.checkValid(handle)[0]): errormsg = QtWidgets.QErrorMessage(self) @@ -990,9 +996,8 @@ class PesterWindow(MovingWindow): try: themeChecker(self.theme) - except ThemeException as xxx_todo_changeme: - (inst) = xxx_todo_changeme - print("Caught: "+inst.parameter) + except ThemeException, (inst): + print "Caught: "+inst.parameter themeWarning = QtWidgets.QMessageBox(self) themeWarning.setText("Theme Error: %s" % (inst)) themeWarning.exec_() @@ -1152,7 +1157,7 @@ class PesterWindow(MovingWindow): @QtCore.pyqtSlot() def updatePC(self): - version.updateDownload(str(self.updatemenu.url)) + version.updateDownload(unicode(self.updatemenu.url)) self.updatemenu = None @QtCore.pyqtSlot() def noUpdatePC(self): @@ -1208,7 +1213,7 @@ class PesterWindow(MovingWindow): return # notify if self.config.notifyOptions() & self.config.NEWMSG: - if 
handle not in self.convos: + if not self.convos.has_key(handle): t = self.tm.Toast("New Conversation", "From: %s" % handle) t.show() elif not self.config.notifyOptions() & self.config.NEWCONVO: @@ -1226,7 +1231,7 @@ class PesterWindow(MovingWindow): elif msg == "PESTERCHUM:UNBLOCK": t = self.tm.Toast("Unblocked", handle) t.show() - if handle not in self.convos: + if not self.convos.has_key(handle): if msg == "PESTERCHUM:CEASE": # ignore cease after we hang up return matchingChums = [c for c in self.chumList.chums if c.handle == handle] @@ -1248,12 +1253,12 @@ class PesterWindow(MovingWindow): else: self.alarm.play() def newMemoMsg(self, chan, handle, msg): - if chan not in self.memos: + if not self.memos.has_key(chan): # silently ignore in case we forgot to /part return memo = self.memos[chan] - msg = str(msg) - if handle not in memo.times: + msg = unicode(msg) + if not memo.times.has_key(handle): # new chum! time current newtime = timedelta(0) time = TimeTracker(newtime) @@ -1291,19 +1296,19 @@ class PesterWindow(MovingWindow): def changeColor(self, handle, color): # pesterconvo and chumlist self.chumList.updateColor(handle, color) - if handle in self.convos: + if self.convos.has_key(handle): self.convos[handle].updateColor(color) self.chumdb.setColor(handle, color) def updateMood(self, handle, mood): # updates OTHER chums' moods oldmood = self.chumList.updateMood(handle, mood) - if handle in self.convos: + if self.convos.has_key(handle): self.convos[handle].updateMood(mood, old=oldmood) if hasattr(self, 'trollslum') and self.trollslum: self.trollslum.updateMood(handle, mood) def newConversation(self, chum, initiated=True): - if type(chum) in [str, str]: + if type(chum) in [str, unicode]: matchingChums = [c for c in self.chumList.chums if c.handle == chum] if len(matchingChums) > 0: mood = matchingChums[0].mood @@ -1313,7 +1318,7 @@ class PesterWindow(MovingWindow): if len(matchingChums) == 0: self.moodRequest.emit(chum) - if chum.handle in self.convos: + if 
self.convos.has_key(chum.handle): self.convos[chum.handle].showChat() return if self.config.tabs(): @@ -1326,10 +1331,10 @@ class PesterWindow(MovingWindow): convoWindow.messageSent.connect(self.sendMessage) convoWindow.windowClosed.connect(self.closeConvo) self.convos[chum.handle] = convoWindow - if str(chum.handle).upper() in BOTNAMES: + if unicode(chum.handle).upper() in BOTNAMES: convoWindow.toggleQuirks(True) convoWindow.quirksOff.setChecked(True) - if str(chum.handle).upper() in CUSTOMBOTS: + if unicode(chum.handle).upper() in CUSTOMBOTS: self.newConvoStarted.emit(chum.handle, initiated) else: self.newConvoStarted.emit(chum.handle, initiated) @@ -1345,7 +1350,7 @@ class PesterWindow(MovingWindow): def newMemo(self, channel, timestr, secret=False, invite=False): if channel == "#pesterchum": return - if channel in self.memos: + if self.memos.has_key(channel): self.memos[channel].showChat() return # do slider dialog then set @@ -1460,19 +1465,19 @@ class PesterWindow(MovingWindow): if hasattr(self, 'moods'): self.moods.removeButtons() mood_list = theme["main/moods"] - mood_list = [dict([(str(k),v) for (k,v) in d.items()]) + mood_list = [dict([(str(k),v) for (k,v) in d.iteritems()]) for d in mood_list] self.moods = PesterMoodHandler(self, *[PesterMoodButton(self, **d) for d in mood_list]) self.moods.showButtons() # chum addChumStyle = "QPushButton { %s }" % (theme["main/addchum/style"]) - if "main/addchum/pressed" in theme: + if theme.has_key("main/addchum/pressed"): addChumStyle += "QPushButton:pressed { %s }" % (theme["main/addchum/pressed"]) pesterButtonStyle = "QPushButton { %s }" % (theme["main/pester/style"]) - if "main/pester/pressed" in theme: + if theme.has_key("main/pester/pressed"): pesterButtonStyle += "QPushButton:pressed { %s }" % (theme["main/pester/pressed"]) blockButtonStyle = "QPushButton { %s }" % (theme["main/block/style"]) - if "main/block/pressed" in theme: + if theme.has_key("main/block/pressed"): pesterButtonStyle += "QPushButton:pressed { 
%s }" % (theme["main/block/pressed"]) self.addChumButton.setText(theme["main/addchum/text"]) self.addChumButton.resize(*theme["main/addchum/size"]) @@ -1497,7 +1502,7 @@ class PesterWindow(MovingWindow): self.mychumcolor.resize(*theme["main/mychumhandle/colorswatch/size"]) self.mychumcolor.move(*theme["main/mychumhandle/colorswatch/loc"]) self.mychumcolor.setStyleSheet("background: %s" % (self.profile().colorhtml())) - if "main/mychumhandle/currentMood" in self.theme: + if self.theme.has_key("main/mychumhandle/currentMood"): moodicon = self.profile().mood.icon(theme) if hasattr(self, 'currentMoodIcon') and self.currentMoodIcon: self.currentMoodIcon.hide() @@ -1518,36 +1523,40 @@ class PesterWindow(MovingWindow): self.mychumcolor.setText("") # sounds - try: - self.alarm, self.memosound, self.namesound, self.ceasesound, self.honksound = \ - [QtMultimedia.QSoundEffect() for i in range(5)] - self.alarm.setSource(QtCore.QUrl.fromLocalFile(theme["main/sounds/alertsound"])) - self.memosound.setSource(QtCore.QUrl.fromLocalFile(theme["main/sounds/memosound"])) - self.namesound.setSource(QtCore.QUrl.fromLocalFile("themes/namealarm.wav")) - self.ceasesound.setSource(QtCore.QUrl.fromLocalFile(theme["main/sounds/ceasesound"])) - self.honksound.setSource(QtCore.QUrl.fromLocalFile("themes/honk.wav")) - except Exception as e: + if not pygame or not pygame.mixer: self.alarm = NoneSound() self.memosound = NoneSound() self.namesound = NoneSound() self.ceasesound = NoneSound() self.honksound = NoneSound() + else: + try: + self.alarm = pygame.mixer.Sound(theme["main/sounds/alertsound"]) + self.memosound = pygame.mixer.Sound(theme["main/sounds/memosound"]) + self.namesound = pygame.mixer.Sound("themes/namealarm.wav") + self.ceasesound = pygame.mixer.Sound(theme["main/sounds/ceasesound"]) + self.honksound = pygame.mixer.Sound("themes/honk.wav") + except Exception, e: + self.alarm = NoneSound() + self.memosound = NoneSound() + self.namesound = NoneSound() + self.ceasesound = NoneSound() + 
self.honksound = NoneSound() self.setVolume(self.config.volume()) def setVolume(self, vol): vol = vol/100.0 - self.alarm.setVolume(vol) - self.memosound.setVolume(vol) - self.namesound.setVolume(vol) - self.ceasesound.setVolume(vol) - self.honksound.setVolume(vol) + self.alarm.set_volume(vol) + self.memosound.set_volume(vol) + self.namesound.set_volume(vol) + self.ceasesound.set_volume(vol) + self.honksound.set_volume(vol) def changeTheme(self, theme): # check theme try: themeChecker(theme) - except ThemeException as xxx_todo_changeme1: - (inst) = xxx_todo_changeme1 + except ThemeException, (inst): themeWarning = QtWidgets.QMessageBox(self) themeWarning.setText("Theme Error: %s" % (inst)) themeWarning.exec_() @@ -1630,7 +1639,7 @@ class PesterWindow(MovingWindow): def pesterSelectedChum(self): curChum = self.chumList.currentItem() if curChum: - text = str(curChum.text(0)) + text = unicode(curChum.text(0)) if text.rfind(" (") != -1: text = text[0:text.rfind(" (")] if text not in self.chumList.groups and \ @@ -1646,7 +1655,7 @@ class PesterWindow(MovingWindow): self.newConversation(chum) @QtCore.pyqtSlot('QString') def closeConvo(self, handle): - h = str(handle) + h = unicode(handle) try: chum = self.convos[h].chum except KeyError: @@ -1662,7 +1671,7 @@ class PesterWindow(MovingWindow): del self.convos[h] @QtCore.pyqtSlot('QString') def closeMemo(self, channel): - c = str(channel) + c = unicode(channel) self.chatlog.finish(c) self.leftChannel.emit(channel) try: @@ -1680,27 +1689,27 @@ class PesterWindow(MovingWindow): @QtCore.pyqtSlot('QString', Mood) def updateMoodSlot(self, handle, mood): - h = str(handle) + h = unicode(handle) self.updateMood(h, mood) @QtCore.pyqtSlot('QString', QtGui.QColor) def updateColorSlot(self, handle, color): - h = str(handle) + h = unicode(handle) self.changeColor(h, color) @QtCore.pyqtSlot('QString', 'QString') def deliverMessage(self, handle, msg): - h = str(handle) - m = str(msg) + h = unicode(handle) + m = unicode(msg) 
self.newMessage(h, m) @QtCore.pyqtSlot('QString', 'QString', 'QString') def deliverMemo(self, chan, handle, msg): - (c, h, m) = (str(chan), str(handle), str(msg)) + (c, h, m) = (unicode(chan), unicode(handle), unicode(msg)) self.newMemoMsg(c,h,m) @QtCore.pyqtSlot('QString', 'QString') def deliverNotice(self, handle, msg): - h = str(handle) - m = str(msg) + h = unicode(handle) + m = unicode(msg) if m.startswith("Your nickname is now being changed to"): changedto = m[39:-1] msgbox = QtWidgets.QMessageBox() @@ -1710,7 +1719,7 @@ class PesterWindow(MovingWindow): ret = msgbox.exec_() elif h == self.randhandler.randNick: self.randhandler.incoming(msg) - elif h in self.convos: + elif self.convos.has_key(h): self.newMessage(h, m) elif h.upper() == "NICKSERV" and "PESTERCHUM:" not in m: m = nickservmsgs.translate(m) @@ -1725,7 +1734,7 @@ class PesterWindow(MovingWindow): msgbox.setStandardButtons(QtWidgets.QMessageBox.Ok | QtWidgets.QMessageBox.Cancel) ret = msgbox.exec_() if ret == QtWidgets.QMessageBox.Ok: - self.newMemo(str(channel), "+0:00") + self.newMemo(unicode(channel), "+0:00") @QtCore.pyqtSlot('QString') def chanInviteOnly(self, channel): self.inviteOnlyChan.emit(channel) @@ -1737,35 +1746,35 @@ class PesterWindow(MovingWindow): self.modesUpdated.emit(channel, modes) @QtCore.pyqtSlot('QString', 'QString', 'QString') def timeCommand(self, chan, handle, command): - (c, h, cmd) = (str(chan), str(handle), str(command)) + (c, h, cmd) = (unicode(chan), unicode(handle), unicode(command)) if self.memos[c]: self.memos[c].timeUpdate(h, cmd) @QtCore.pyqtSlot('QString', 'QString', 'QString') def quirkDisable(self, channel, msg, op): - (c, msg, op) = (str(channel), str(msg), str(op)) - if c not in self.memos: + (c, msg, op) = (unicode(channel), unicode(msg), unicode(op)) + if not self.memos.has_key(c): return memo = self.memos[c] memo.quirkDisable(op, msg) @QtCore.pyqtSlot('QString', PesterList) def updateNames(self, channel, names): - c = str(channel) + c = unicode(channel) 
# update name DB self.namesdb[c] = names # warn interested party of names self.namesUpdated.emit(c) @QtCore.pyqtSlot('QString', 'QString', 'QString') def userPresentUpdate(self, handle, channel, update): - c = str(channel) - n = str(handle) + c = unicode(channel) + n = unicode(handle) if update == "nick": l = n.split(":") oldnick = l[0] newnick = l[1] if update in ("quit", "netsplit"): - for c in list(self.namesdb.keys()): + for c in self.namesdb.keys(): try: i = self.namesdb[c].index(n) self.namesdb[c].pop(i) @@ -1782,7 +1791,7 @@ class PesterWindow(MovingWindow): except KeyError: self.namesdb[c] = [] elif update == "nick": - for c in list(self.namesdb.keys()): + for c in self.namesdb.keys(): try: i = self.namesdb[c].index(oldnick) self.namesdb[c].pop(i) @@ -1809,12 +1818,12 @@ class PesterWindow(MovingWindow): available_groups = [g[0] for g in self.config.getGroups()] self.addchumdialog = AddChumDialog(available_groups, self) ok = self.addchumdialog.exec_() - handle = str(self.addchumdialog.chumBox.text()).strip() - newgroup = str(self.addchumdialog.newgroup.text()).strip() + handle = unicode(self.addchumdialog.chumBox.text()).strip() + newgroup = unicode(self.addchumdialog.newgroup.text()).strip() selectedGroup = self.addchumdialog.groupBox.currentText() group = newgroup if newgroup else selectedGroup if ok: - handle = str(handle) + handle = unicode(handle) if handle in [h.handle for h in self.chumList.chums]: self.addchumdialog = None return @@ -1846,10 +1855,10 @@ class PesterWindow(MovingWindow): @QtCore.pyqtSlot('QString') def blockChum(self, handle): - h = str(handle) + h = unicode(handle) self.config.addBlocklist(h) self.config.removeChum(h) - if h in self.convos: + if self.convos.has_key(h): convo = self.convos[h] msg = self.profile().pestermsg(convo.chum, QtGui.QColor(self.theme["convo/systemMsgColor"]), self.theme["convo/text/blocked"]) convo.textArea.append(convertTags(msg)) @@ -1864,9 +1873,9 @@ class PesterWindow(MovingWindow): 
@QtCore.pyqtSlot('QString') def unblockChum(self, handle): - h = str(handle) + h = unicode(handle) self.config.delBlocklist(h) - if h in self.convos: + if self.convos.has_key(h): convo = self.convos[h] msg = self.profile().pestermsg(convo.chum, QtGui.QColor(self.theme["convo/systemMsgColor"]), self.theme["convo/text/unblocked"]) convo.textArea.append(convertTags(msg)) @@ -1887,7 +1896,7 @@ class PesterWindow(MovingWindow): self.randhandler.setIdle(True) sysColor = QtGui.QColor(self.theme["convo/systemMsgColor"]) verb = self.theme["convo/text/idle"] - for (h, convo) in self.convos.items(): + for (h, convo) in self.convos.iteritems(): if convo.chumopen: msg = self.profile().idlemsg(sysColor, verb) convo.textArea.append(convertTags(msg)) @@ -1921,7 +1930,7 @@ class PesterWindow(MovingWindow): return fp = open(f, 'r') regexp_state = None - for l in fp: + for l in fp.xreadlines(): # import chumlist l = l.rstrip() chum_mo = re.match("handle: ([A-Za-z0-9]+)", l) @@ -1935,7 +1944,7 @@ class PesterWindow(MovingWindow): replace = replace_mo.group(1) try: re.compile(regexp_state) - except re.error as e: + except re.error, e: continue newquirk = pesterQuirk({"type": "regexp", "from": regexp_state, @@ -1971,18 +1980,18 @@ class PesterWindow(MovingWindow): @QtCore.pyqtSlot() def joinSelectedMemo(self): - time = str(self.memochooser.timeinput.text()) + time = unicode(self.memochooser.timeinput.text()) secret = self.memochooser.secretChannel.isChecked() invite = self.memochooser.inviteChannel.isChecked() if self.memochooser.newmemoname(): newmemo = self.memochooser.newmemoname() - channel = "#"+str(newmemo).replace(" ", "_") + channel = "#"+unicode(newmemo).replace(" ", "_") channel = re.sub(r"[^A-Za-z0-9#_]", "", channel) self.newMemo(channel, time, secret=secret, invite=invite) for SelectedMemo in self.memochooser.SelectedMemos(): - channel = "#"+str(SelectedMemo.target) + channel = "#"+unicode(SelectedMemo.target) self.newMemo(channel, time) self.memochooser = None @@ -2009,12 
+2018,12 @@ class PesterWindow(MovingWindow): @QtCore.pyqtSlot('QString') def userListAdd(self, handle): - h = str(handle) + h = unicode(handle) chum = PesterProfile(h, chumdb=self.chumdb) self.addChum(chum) @QtCore.pyqtSlot('QString') def userListPester(self, handle): - h = str(handle) + h = unicode(handle) self.newConversation(h) @QtCore.pyqtSlot() def userListClose(self): @@ -2034,7 +2043,7 @@ class PesterWindow(MovingWindow): @QtCore.pyqtSlot() def updateQuirks(self): for i in range(self.quirkmenu.quirkList.topLevelItemCount()): - curgroup = str(self.quirkmenu.quirkList.topLevelItem(i).text(0)) + curgroup = unicode(self.quirkmenu.quirkList.topLevelItem(i).text(0)) for j in range(self.quirkmenu.quirkList.topLevelItem(i).childCount()): item = self.quirkmenu.quirkList.topLevelItem(i).child(j) item.quirk.quirk["on"] = item.quirk.on = (item.checkState(0) == QtCore.Qt.Checked) @@ -2057,7 +2066,7 @@ class PesterWindow(MovingWindow): (chum, ok) = QtWidgets.QInputDialog.getText(self, "Pester Chum", "Enter a handle to pester:") try: if ok: - self.newConversation(str(chum)) + self.newConversation(unicode(chum)) except: pass finally: @@ -2085,7 +2094,7 @@ class PesterWindow(MovingWindow): if not self.addgroupdialog: (gname, ok) = QtWidgets.QInputDialog.getText(self, "Add Group", "Enter a name for the new group:") if ok: - gname = str(gname) + gname = unicode(gname) if re.search("[^A-Za-z0-9_\s]", gname) is not None: msgbox = QtWidgets.QMessageBox() msgbox.setInformativeText("THIS IS NOT A VALID GROUP NAME") @@ -2135,7 +2144,7 @@ class PesterWindow(MovingWindow): # combine self.createTabWindow() newconvos = {} - for (h,c) in self.convos.items(): + for (h,c) in self.convos.iteritems(): c.setParent(self.tabconvo) self.tabconvo.addChat(c) self.tabconvo.show() @@ -2165,7 +2174,7 @@ class PesterWindow(MovingWindow): # combine newmemos = {} self.createMemoTabWindow() - for (h,m) in self.memos.items(): + for (h,m) in self.memos.iteritems(): m.setParent(self.tabmemo) 
self.tabmemo.addChat(m) self.tabmemo.show() @@ -2214,7 +2223,7 @@ class PesterWindow(MovingWindow): # timestamps timestampsetting = self.optionmenu.timestampcheck.isChecked() self.config.set("showTimeStamps", timestampsetting) - timeformatsetting = str(self.optionmenu.timestampBox.currentText()) + timeformatsetting = unicode(self.optionmenu.timestampBox.currentText()) if timeformatsetting == "12 hour": self.config.set("time12Format", True) else: @@ -2324,7 +2333,7 @@ class PesterWindow(MovingWindow): self.config.set('blink', blinksetting) # toast notifications self.tm.setEnabled(self.optionmenu.notifycheck.isChecked()) - self.tm.setCurrentType(str(self.optionmenu.notifyOptions.currentText())) + self.tm.setCurrentType(unicode(self.optionmenu.notifyOptions.currentText())) notifysetting = 0 if self.optionmenu.notifySigninCheck.isChecked(): notifysetting |= self.config.SIGNIN @@ -2364,7 +2373,7 @@ class PesterWindow(MovingWindow): newmodes = self.optionmenu.modechange.text() if newmodes: self.setChannelMode.emit(self.profile().handle, newmodes, "") - except Exception as e: + except Exception, e: logging.error(e) finally: self.optionmenu = None @@ -2391,13 +2400,13 @@ class PesterWindow(MovingWindow): @QtCore.pyqtSlot() def themeSelected(self, override=False): if not override: - themename = str(self.optionmenu.themeBox.currentText()) + themename = unicode(self.optionmenu.themeBox.currentText()) else: themename = override if override or themename != self.theme.name: try: self.changeTheme(pesterTheme(themename)) - except ValueError as e: + except ValueError, e: themeWarning = QtWidgets.QMessageBox(self) themeWarning.setText("Theme Error: %s" % (e)) themeWarning.exec_() @@ -2413,14 +2422,14 @@ class PesterWindow(MovingWindow): def profileSelected(self): if self.chooseprofile.profileBox and \ self.chooseprofile.profileBox.currentIndex() > 0: - handle = str(self.chooseprofile.profileBox.currentText()) + handle = unicode(self.chooseprofile.profileBox.currentText()) if handle 
== self.profile().handle: self.chooseprofile = None return self.userprofile = userProfile(handle) self.changeTheme(self.userprofile.getTheme()) else: - handle = str(self.chooseprofile.chumHandle.text()) + handle = unicode(self.chooseprofile.chumHandle.text()) if handle == self.profile().handle: self.chooseprofile = None return @@ -2519,7 +2528,7 @@ class PesterWindow(MovingWindow): if not hasattr(self, 'chooseprofile'): self.chooseprofile = None if not self.chooseprofile: - h = str(handle) + h = unicode(handle) self.changeProfile(collision=h) @QtCore.pyqtSlot('QString') def myHandleChanged(self, handle): @@ -2607,6 +2616,15 @@ class MainProgram(QtCore.QObject): options = self.oppts(sys.argv[1:]) + if pygame and pygame.mixer: + # we could set the frequency higher but i love how cheesy it sounds + try: + pygame.mixer.init() + pygame.mixer.init() + except pygame.error, e: + print "Warning: No sound! %s" % (e) + else: + print "Warning: No sound!" self.widget = PesterWindow(options, app=self.app) self.widget.show() @@ -2664,7 +2682,7 @@ class MainProgram(QtCore.QObject): @QtCore.pyqtSlot() def runUpdateSlot(self): - q = queue.Queue(1) + q = Queue.Queue(1) s = threading.Thread(target=version.updateCheck, args=(q,)) w = threading.Thread(target=self.showUpdate, args=(q,)) w.start() @@ -2795,7 +2813,7 @@ Click this message to never see this again.") for c in self.widget.tabmemo.convos: self.irc.joinChannel(c) else: - for c in list(self.widget.memos.values()): + for c in self.widget.memos.values(): self.irc.joinChannel(c.channel) return True diff --git a/profile.py b/profile.py index 6d5ef1e..64b6d28 100644 --- a/profile.py +++ b/profile.py @@ -41,17 +41,17 @@ class PesterLog(object): if not self.parent.config.logPesters() & self.parent.config.LOG: return if not self.parent.config.logPesters() & self.parent.config.STAMP: time = "" - if str(handle).upper() == "NICKSERV": return + if unicode(handle).upper() == "NICKSERV": return #watch out for illegal characters handle = 
re.sub(r'[<>:"/\\|?*]', "_", handle) bbcodemsg = time + convertTags(msg, "bbcode") html = time + convertTags(msg, "html")+"
" msg = time +convertTags(msg, "text") modes = {"bbcode": bbcodemsg, "html": html, "text": msg} - if handle not in self.convos: + if not self.convos.has_key(handle): time = datetime.now().strftime("%Y-%m-%d.%H.%M") self.convos[handle] = {} - for (format, t) in modes.items(): + for (format, t) in modes.iteritems(): if not os.path.exists("%s/%s/%s/%s" % (self.logpath, self.handle, handle, format)): os.makedirs("%s/%s/%s/%s" % (self.logpath, self.handle, handle, format)) try: @@ -63,7 +63,7 @@ class PesterLog(object): errmsg.show() continue self.convos[handle][format] = fp - for (format, t) in modes.items(): + for (format, t) in modes.iteritems(): f = self.convos[handle][format] if platform.system() == "Windows": f.write(t+"\r\n") @@ -71,14 +71,14 @@ class PesterLog(object): f.write(t+"\r\n") f.flush() def finish(self, handle): - if handle not in self.convos: + if not self.convos.has_key(handle): return - for f in list(self.convos[handle].values()): + for f in self.convos[handle].values(): f.close() del self.convos[handle] def close(self): - for h in list(self.convos.keys()): - for f in list(self.convos[h].values()): + for h in self.convos.keys(): + for f in self.convos[h].values(): f.close() class userConfig(object): @@ -100,7 +100,7 @@ class userConfig(object): fp = open(self.filename) self.config = json.load(fp) fp.close() - if "defaultprofile" in self.config: + if self.config.has_key("defaultprofile"): self.userprofile = userProfile(self.config["defaultprofile"]) else: self.userprofile = None @@ -125,7 +125,7 @@ class userConfig(object): fp.close() def chums(self): - if 'chums' not in self.config: + if not self.config.has_key('chums'): self.set("chums", []) return self.config.get('chums', []) def setChums(self, newchums): @@ -148,19 +148,19 @@ class userConfig(object): def tabs(self): return self.config.get("tabs", True) def tabMemos(self): - if 'tabmemos' not in self.config: + if not self.config.has_key('tabmemos'): self.set("tabmemos", self.tabs()) return 
self.config.get("tabmemos", True) def showTimeStamps(self): - if 'showTimeStamps' not in self.config: + if not self.config.has_key('showTimeStamps'): self.set("showTimeStamps", True) return self.config.get('showTimeStamps', True) def time12Format(self): - if 'time12Format' not in self.config: + if not self.config.has_key('time12Format'): self.set("time12Format", True) return self.config.get('time12Format', True) def showSeconds(self): - if 'showSeconds' not in self.config: + if not self.config.has_key('showSeconds'): self.set("showSeconds", False) return self.config.get('showSeconds', False) def sortMethod(self): @@ -174,11 +174,11 @@ class userConfig(object): return g[1] return True def showEmptyGroups(self): - if 'emptyGroups' not in self.config: + if not self.config.has_key('emptyGroups'): self.set("emptyGroups", False) return self.config.get('emptyGroups', False) def showOnlineNumbers(self): - if 'onlineNumbers' not in self.config: + if not self.config.has_key('onlineNumbers'): self.set("onlineNumbers", False) return self.config.get('onlineNumbers', False) def logPesters(self): @@ -238,7 +238,7 @@ class userConfig(object): newchums = [c for c in self.config['chums'] if c != handle] self.set("chums", newchums) def getBlocklist(self): - if 'block' not in self.config: + if not self.config.has_key('block'): self.set('block', []) return self.config['block'] def addBlocklist(self, handle): @@ -251,7 +251,7 @@ class userConfig(object): l.pop(l.index(handle)) self.set('block', l) def getGroups(self): - if 'groups' not in self.groups: + if not self.groups.has_key('groups'): self.saveGroups([["Chums", True]]) return self.groups.get('groups', [["Chums", True]]) def addGroup(self, group, open=True): @@ -285,7 +285,7 @@ class userConfig(object): self.groups['groups'] = groups try: jsonoutput = json.dumps(self.groups) - except ValueError as e: + except ValueError, e: raise e fp = open("%s/groups.js" % (self.logpath), 'w') fp.write(jsonoutput) @@ -300,7 +300,7 @@ class 
userConfig(object): return self.parent.portOverride return self.config.get('port', '6667') def soundOn(self): - if 'soundon' not in self.config: + if not self.config.has_key('soundon'): self.set('soundon', True) return self.config['soundon'] def chatSound(self): @@ -319,7 +319,7 @@ class userConfig(object): self.config[item] = setting try: jsonoutput = json.dumps(self.config) - except ValueError as e: + except ValueError, e: raise e fp = open(self.filename, 'w') fp.write(jsonoutput) @@ -356,7 +356,7 @@ class userProfile(object): if type(user) is PesterProfile: self.chat = user self.userprofile = {"handle":user.handle, - "color": str(user.color.name()), + "color": unicode(user.color.name()), "quirks": [], "theme": "pesterchum"} self.theme = pesterTheme("pesterchum") @@ -377,7 +377,7 @@ class userProfile(object): fp.close() try: self.theme = pesterTheme(self.userprofile["theme"]) - except ValueError as e: + except ValueError, e: self.theme = pesterTheme("pesterchum") self.lastmood = self.userprofile.get('lastmood', self.theme["main/defaultmood"]) self.chat = PesterProfile(self.userprofile["handle"], @@ -402,7 +402,7 @@ class userProfile(object): try: with open(_datadir+"passwd.js") as fp: self.passwd = json.load(fp) - except Exception as e: + except Exception, e: self.passwd = {} self.autoidentify = False self.nickservpass = "" @@ -418,7 +418,7 @@ class userProfile(object): self.save() def setColor(self, color): self.chat.color = color - self.userprofile["color"] = str(color.name()) + self.userprofile["color"] = unicode(color.name()) self.save() def setQuirks(self, quirks): self.quirks = quirks @@ -436,7 +436,7 @@ class userProfile(object): try: for (i,m) in enumerate(mentions): re.compile(m) - except re.error as e: + except re.error, e: logging.error("#%s Not a valid regular expression: %s" % (i, e)) else: self.mentions = mentions @@ -479,19 +479,19 @@ class userProfile(object): return try: jsonoutput = json.dumps(self.userprofile) - except ValueError as e: + except 
ValueError, e: raise e fp = open("%s/%s.js" % (self.profiledir, handle), 'w') fp.write(jsonoutput) fp.close() def saveNickServPass(self): # remove profiles with no passwords - for h,t in list(self.passwd.items()): + for h,t in self.passwd.items(): if "auto" not in t or "pw" not in t or t["pw"] == "": del self.passwd[h] try: jsonoutput = json.dumps(self.passwd, indent=4) - except ValueError as e: + except ValueError, e: raise e with open(_datadir+"passwd.js", 'w') as fp: fp.write(jsonoutput) @@ -526,7 +526,7 @@ class PesterProfileDB(dict): fp.close() u = [] - for (handle, c) in chumdict.items(): + for (handle, c) in chumdict.iteritems(): options = dict() if 'group' in c: options['group'] = c['group'] @@ -543,39 +543,39 @@ class PesterProfileDB(dict): def save(self): try: fp = open("%s/chums.js" % (self.logpath), 'w') - chumdict = dict([p.plaindict() for p in self.values()]) + chumdict = dict([p.plaindict() for p in self.itervalues()]) json.dump(chumdict, fp) fp.close() - except Exception as e: + except Exception, e: raise e def getColor(self, handle, default=None): - if handle not in self: + if not self.has_key(handle): return default else: return self[handle].color def setColor(self, handle, color): - if handle in self: + if self.has_key(handle): self[handle].color = color else: self[handle] = PesterProfile(handle, color) def getGroup(self, handle, default="Chums"): - if handle not in self: + if not self.has_key(handle): return default else: return self[handle].group def setGroup(self, handle, theGroup): - if handle in self: + if self.has_key(handle): self[handle].group = theGroup else: self[handle] = PesterProfile(handle, group=theGroup) self.save() def getNotes(self, handle, default=""): - if handle not in self: + if not self.has_key(handle): return default else: return self[handle].notes def setNotes(self, handle, notes): - if handle in self: + if self.has_key(handle): self[handle].notes = notes else: self[handle] = PesterProfile(handle, notes=notes) @@ -604,7 
+604,7 @@ class pesterTheme(dict): except IOError: theme = json.loads("{}") self.update(theme) - if "inherits" in self: + if self.has_key("inherits"): self.inheritedTheme = pesterTheme(self["inherits"]) if not default: self.defaultTheme = pesterTheme("pesterchum", default=True) @@ -612,7 +612,7 @@ class pesterTheme(dict): keys = key.split("/") try: v = dict.__getitem__(self, keys.pop(0)) - except KeyError as e: + except KeyError, e: if hasattr(self, 'inheritedTheme'): return self.inheritedTheme[key] if hasattr(self, 'defaultTheme'): @@ -622,7 +622,7 @@ class pesterTheme(dict): for k in keys: try: v = v[k] - except KeyError as e: + except KeyError, e: if hasattr(self, 'inheritedTheme'): return self.inheritedTheme[key] if hasattr(self, 'defaultTheme'): @@ -631,8 +631,8 @@ class pesterTheme(dict): raise e return v def pathHook(self, d): - for (k, v) in d.items(): - if type(v) is str: + for (k, v) in d.iteritems(): + if type(v) is unicode: s = Template(v) d[k] = s.safe_substitute(path=self.path) return d @@ -658,6 +658,6 @@ class pesterTheme(dict): return False if v is None else True except KeyError: if hasattr(self, 'inheritedTheme'): - return key in self.inheritedTheme + return self.inheritedTheme.has_key(key) else: return False diff --git a/pyquirks.py b/pyquirks.py index e275cc5..f4a5b37 100644 --- a/pyquirks.py +++ b/pyquirks.py @@ -12,20 +12,20 @@ class PythonQuirks(ScriptQuirks): def modHas(self, module, attr): if attr == 'commands': variables = vars(module) - for name, obj in variables.items(): + for name, obj in variables.iteritems(): if self.modHas(obj, 'command'): return True return hasattr(module, attr) def register(self, module): variables = vars(module) - for name, obj in variables.items(): + for name, obj in variables.iteritems(): if self.modHas(obj, 'command'): try: - if not isinstance(obj("test"), str): + if not isinstance(obj("test"), basestring): raise Exception except: - print("Quirk malformed: %s" % (obj.command)) + print "Quirk malformed: %s" % 
(obj.command) msgbox = QtWidgets.QMessageBox() msgbox.setWindowTitle("Error!") msgbox.setText("Quirk malformed: %s" % (obj.command)) diff --git a/quirks.py b/quirks.py index 2863918..7499abe 100644 --- a/quirks.py +++ b/quirks.py @@ -20,7 +20,7 @@ class ScriptQuirks(object): self.last = self.quirks.copy() self.quirks.clear() for script in self.scripts: - print(script.getExtension()) + print script.getExtension() script.load() #print script.quirks for q in script.quirks: @@ -31,9 +31,9 @@ class ScriptQuirks(object): del self.quirks[k] #print self.quirks if self.quirks: - print('Registered quirks:', '(), '.join(self.quirks) + "()") + print 'Registered quirks:', '(), '.join(self.quirks) + "()" else: - print("Warning: Couldn't find any script quirks") + print "Warning: Couldn't find any script quirks" def add(self, script): self.scripts.append(script) @@ -64,8 +64,8 @@ class ScriptQuirks(object): module = self.loadModule(name, filename) if module is None: continue - except Exception as e: - print("Error loading %s: %s (in quirks.py)" % (os.path.basename(name), e)) + except Exception, e: + print "Error loading %s: %s (in quirks.py)" % (os.path.basename(name), e) msgbox = QtWidgets.QMessageBox() msgbox.setWindowTitle("Error!") msgbox.setText("Error loading %s: %s (in quirks.py)" % (os.path.basename(filename), e)) diff --git a/randomer.py b/randomer.py index 4df2b04..af60239 100644 --- a/randomer.py +++ b/randomer.py @@ -63,6 +63,6 @@ class RandomHandler(QtCore.QObject): msgbox.setInformativeText("Try again later :(") msgbox.exec_() return - name = str(l[1]) - print(name) + name = unicode(l[1]) + print name self.mainwindow.newConversation(name) diff --git a/toast.py b/toast.py index 2d89cad..c040d62 100644 --- a/toast.py +++ b/toast.py @@ -4,29 +4,27 @@ import time, os import ostools from PyQt5 import QtGui, QtCore, QtWidgets -import logging - try: import pynotify except: pynotify = None -class DefaultToast(QtWidgets.QWidget): +class DefaultToast(object): def 
__init__(self, parent, **kwds): - super().__init__(parent) + super(DefaultToast, self).__init__(parent, **kwds) self.machine = kwds.get('machine') self.title = kwds.get('title') self.msg = kwds.get('msg') self.icon = kwds.get('icon') def show(self): - print(self.title, self.msg, self.icon) + print self.title, self.msg, self.icon self.done() def done(self): t = self.machine.toasts[0] if t.title == self.title and t.msg == self.msg and t.icon == self.icon: self.machine.toasts.pop(0) self.machine.displaying = False - print("Done") + print "Done" class ToastMachine(object): class __Toast__(object): @@ -75,7 +73,7 @@ class ToastMachine(object): def realShow(self): self.machine.displaying = True t = None - for (k,v) in self.machine.types.items(): + for (k,v) in self.machine.types.iteritems(): if self.machine.type == k: try: args = inspect.getargspec(v.__init__).args @@ -145,15 +143,15 @@ class ToastMachine(object): if type in self.types: if type == "libnotify": if not pynotify or not pynotify.init("ToastMachine"): - print("Problem initilizing pynotify") + print "Problem initilizing pynotify" return #self.type = type = "default" elif type == "twmn": from libs import pytwmn try: pytwmn.init() - except pytwmn.ERROR as e: - print("Problem initilizing pytwmn: " + str(e)) + except pytwmn.ERROR, e: + print "Problem initilizing pytwmn: " + str(e) return #self.type = type = "default" self.type = type @@ -179,11 +177,9 @@ class ToastMachine(object): self.showNext() -class PesterToast(DefaultToast): +class PesterToast(QtWidgets.QWidget, DefaultToast): def __init__(self, machine, title, msg, icon, time=3000, parent=None): - logging.info(isinstance(parent, QtWidgets.QWidget)) - kwds = dict(machine=machine, title=title, msg=msg, icon=icon) - super().__init__(parent, **kwds) + super(PesterToast, self).__init__(self, parent, machine=machine, title=title, msg=msg, icon=icon) self.machine = machine self.time = time @@ -214,6 +210,7 @@ class PesterToast(DefaultToast): 
self.icon.pixmap().fill(QtGui.QColor(0,0,0,0)) layout_0 = QtWidgets.QVBoxLayout() + layout_0.setMargin(0) layout_0.setContentsMargins(0, 0, 0, 0) if self.icon: @@ -240,7 +237,7 @@ class PesterToast(DefaultToast): self.msg.setStyleSheet(self.parent().theme["toasts/content/style"]) self.layout().setSpacing(0) - self.msg.setText(PesterToast.wrapText(self.msg.font(), str(self.msg.text()), self.parent().theme["toasts/width"], self.parent().theme["toasts/content/style"])) + self.msg.setText(PesterToast.wrapText(self.msg.font(), unicode(self.msg.text()), self.parent().theme["toasts/width"], self.parent().theme["toasts/content/style"])) p = QtWidgets.QApplication.desktop().availableGeometry(self).bottomRight() o = QtWidgets.QApplication.desktop().screenGeometry(self).bottomRight() @@ -258,8 +255,8 @@ class PesterToast(DefaultToast): def done(self): QtWidgets.QWidget.hide(self) t = self.machine.toasts[0] - if t.title == str(self.title.text()) and \ - t.msg == str(self.content): + if t.title == unicode(self.title.text()) and \ + t.msg == unicode(self.content): self.machine.toasts.pop(0) self.machine.displaying = False if self.machine.on: @@ -269,7 +266,7 @@ class PesterToast(DefaultToast): @QtCore.pyqtSlot() def reverseTrigger(self): if self.time >= 0: - QtCore.QTimer.singleShot(self.time, self.reverseStart) + QtCore.QTimer.singleShot(self.time, self, QtCore.SLOT('reverseStart()')) @QtCore.pyqtSlot() def reverseStart(self): @@ -286,7 +283,7 @@ class PesterToast(DefaultToast): def updateBottomLeftAnimation(self, value): p = QtWidgets.QApplication.desktop().availableGeometry(self).bottomRight() val = float(self.height())/100 - self.move(p.x()-self.width(), p.y() - (value * val) +1) + self.move(p.x()-self.width(), p.y() - (value.toInt()[0] * val) +1) self.layout().setSpacing(0) QtWidgets.QWidget.show(self) @@ -352,7 +349,7 @@ class PesterToast(DefaultToast): break if (metric.width(text[:lastspace]) > maxwidth) or \ len(text[:lastspace]) < 1: - for i in range(len(text)): + for i 
in xrange(len(text)): if metric.width(text[:i]) > maxwidth: lastspace = i-1 break diff --git a/updatecheck.py b/updatecheck.py index bba4302..b5dfd5b 100644 --- a/updatecheck.py +++ b/updatecheck.py @@ -34,20 +34,19 @@ class MSPAChecker(QtWidgets.QWidget): raise if os.path.exists("status_old.pkl"): os.remove("status_old.pkl") - except Exception as e: - print(e) + except Exception, e: + print e msg = QtWidgets.QMessageBox(self) msg.setText("Problems writing save file.") msg.show() @QtCore.pyqtSlot() def check_site_wrapper(self): - return # turn off MSPA check; python3 doesnt like it if not self.mainwindow.config.checkMSPA(): return if self.lock: return - print("Checking MSPA updates...") + print "Checking MSPA updates..." s = threading.Thread(target=self.check_site) s.start() @@ -89,7 +88,7 @@ class MSPAChecker(QtWidgets.QWidget): @QtCore.pyqtSlot() def visit_site(self): - print(self.status['last_visited']['link']) + print self.status['last_visited']['link'] QtGui.QDesktopServices.openUrl(QtCore.QUrl(self.status['last_visited']['link'], QtCore.QUrl.TolerantMode)) if self.status['last_seen']['pubdate'] > self.status['last_visited']['pubdate']: #Visited for the first time. Untrip the icon and remember that we saw it. diff --git a/version.py b/version.py index b9acc98..59b0d76 100644 --- a/version.py +++ b/version.py @@ -1,4 +1,4 @@ -import urllib.request, urllib.parse, urllib.error +import urllib import re import time try: @@ -67,31 +67,31 @@ def lexVersion(short=False): # Naughty I know, but it lets me grab it from the bash script. 
if __name__ == "__main__": - print(lexVersion()) + print lexVersion() def verStrToNum(ver): w = re.match("(\d+\.?\d+)\.(\d+)-?([A-Za-z]{0,2})\.?(\d*):(\S+)", ver) if not w: - print("Update check Failure: 3"); return + print "Update check Failure: 3"; return full = ver[:ver.find(":")] return full,w.group(1),w.group(2),w.group(3),w.group(4),w.group(5) def updateCheck(q): time.sleep(3) - data = urllib.parse.urlencode({"type" : USER_TYPE, "os" : OS_TYPE, "install" : INSTALL_TYPE}) + data = urllib.urlencode({"type" : USER_TYPE, "os" : OS_TYPE, "install" : INSTALL_TYPE}) try: - f = urllib.request.urlopen("http://distantsphere.com/pesterchum.php?" + data) + f = urllib.urlopen("http://distantsphere.com/pesterchum.php?" + data) except: - print("Update check Failure: 1"); return q.put((False,1)) + print "Update check Failure: 1"; return q.put((False,1)) newest = f.read() f.close() if not newest or newest[0] == "<": - print("Update check Failure: 2"); return q.put((False,2)) + print "Update check Failure: 2"; return q.put((False,2)) try: (full, major, minor, status, revision, url) = verStrToNum(newest) except TypeError: return q.put((False,3)) - print(full) + print full if major <= _pcMajor: if minor <= _pcMinor: if status: @@ -102,7 +102,7 @@ def updateCheck(q): if not _pcStatus: if revision <= _pcRevision: return q.put((False,0)) - print("A new version of Pesterchum is avaliable!") + print "A new version of Pesterchum is avaliable!" 
q.put((full,url)) @@ -128,9 +128,9 @@ def copyUpdate(path): def updateExtract(url, extension): if extension: fn = "update" + extension - urllib.request.urlretrieve(url, fn) + urllib.urlretrieve(url, fn) else: - fn = urllib.request.urlretrieve(url)[0] + fn = urllib.urlretrieve(url)[0] if tarfile and tarfile.is_tarfile(fn): extension = ".tar.gz" elif zipfile.is_zipfile(fn): @@ -144,17 +144,17 @@ def updateExtract(url, extension): except: pass - print(url, fn, extension) + print url, fn, extension if extension == ".exe": pass elif extension == ".zip" or extension.startswith(".tar"): if extension == ".zip": from zipfile import is_zipfile as is_updatefile, ZipFile as openupdate - print("Opening .zip") + print "Opening .zip" elif tarfile and extension.startswith(".tar"): from tarfile import is_tarfile as is_updatefile, open as openupdate - print("Opening .tar") + print "Opening .tar" else: return