rc-buggy packages sorted by popularity

After doing a round of tag reviews so that we release lenny with the latest tag contributions, I still had a bit of spare time. Since people are urging to fix the last bugs I thought I could contribute a list of RC-buggy packages sorted by popularity:

#!/usr/bin/python

import urllib2
import sys, re #, gzip

PKGRE = re.compile(r'<strong>Package: </strong><a href="[^"]+" name="([^"]+)">')
POPCONRE = re.compile(r'^(\d+)\s+(\S+)')
PKGMAPRE = re.compile(r'^([^(]+)\(([^)]+)')
COMMASPLIT = re.compile(r'\s*,\s*')

# Allow to download other package views
if len(sys.argv) > 1:
    url = sys.argv[1]
else:
    url = "http://bts.turmzimmer.net/details.php"

def read_packages(url):
    "Download the list of rc-buggy packages"
    fd = urllib2.urlopen(url)
    for line in fd:
        mo = PKGRE.search(line)
        if mo:
            yield mo.group(1)

# FIXME: this is a deprecated data source, but at the moment it is just what
#        is needed
def read_bintosrc(url = "http://qa.debian.org/data/ddpo/results/ddpo_packages"):
    "Download the binary to source package map"
    fd = urllib2.urlopen(url)
    for line in fd:
        mo = PKGMAPRE.search(line)
        if mo:
            src = mo.group(1)
            bin = COMMASPLIT.split(mo.group(2))
            for p in bin:
                yield p, src

def read_popcon(url = "http://popcon.debian.org/by_inst"):
    "Download popcon information"
    fd = urllib2.urlopen(url)
    # It doesn't work on urllib2 objects because it wants tell()
    #fd = gzip.GzipFile(fileobj=fd, filename=url, mode="rb")
    for line in fd:
        mo = POPCONRE.match(line)
        if mo:
            yield mo.group(2), int(mo.group(1))

print "Reading mapping of binary packages to source packages..."
bin2src = dict(read_bintosrc())
print "Reading popcon scores..."
# We actually want the score of source packages, and for it we use the score of
# its binary package with the highest score
scores = dict()
for pkg, score in read_popcon():
    pkg = bin2src.get(pkg, pkg)
    if pkg not in scores:
        scores[pkg] = score
#scores = dict(read_popcon("http://popcon.debian.org/by_inst.gz"))
print "Reading list of rc-buggy packages..."
# The bug page has a mix of source and binary packages: convert all to source
# packages
packages = sorted(read_packages(url), key=lambda x:scores.get(bin2src.get(x,x), 0))
for p in packages:
    print scores.get(bin2src.get(p,p), 0), p

This can be run on people.debian.org as ~enrico/popzimmer (0 is the rank of packages that for some reason cannot be mapped to any rank):

$ ~enrico/popzimmer
Reading mapping of binary packages to source packages...
Reading popcon scores...
Reading list of rc-buggy packages...
0 roxen-fonts-iso8859-2
2 dpkg
11 pam
61 glibc
61 libc6
123 initramfs-tools
149 bind9
150 grub
180 portmap
213 nfs-common
279 libgl1-mesa-dri
298 libsnmp-base
298 snmpd
309 avahi
397 mysql-dfsg-5.0
408 xserver-xorg-input-wacom
409 xserver-xorg-video-vesa
428 docbook-xml
499 openoffice.org-core
567 xine-lib
600 evolution-data-server
669 eog
671 xserver-xorg-video-intel
691 epiphany-browser
691 epiphany-gecko
695 kdelibs4c2a
698 uswsusp
754 guile-1.6
959 gpm
964 cups
990 kdebase
1002 ghostscript
1069 giflib
1076 xulrunner-1.9
1177 libapache2-mod-perl2
1217 vlc-nox
1229 kernel-package
1311 kdeutils-dev
1520 libparted1.8-10
1767 wireshark
1785 selinux-policy-default
1908 dia
1943 htop
1983 boost
2140 cantlr
2145 transmission
2159 emacs22
2312 swfdec-mozilla
2449 eclipse
2449 libswt3.2-gtk-java
2728 lilo
2898 libnss-ldap
3013 java-access-bridge
3051 blender
3099 libtemplate-perl
3499 fglrx-atieventsd
3602 virtualbox-ose
3619 jfsutils
3806 anjuta
3962 kbuild
4076 libgnustep-gui0.14
4222 libwebkit-1.0-1
4332 rosegarden
4412 matplotlib
4412 python-matplotlib
4950 mail-notification-evolution
5195 nut-cgi
5273 sbcl
5538 ruby1.9
5679 kmymoney2
6076 websvn
6147 miro
6264 gnat-4.1
6431 gallery2
6478 gnome-chess
6512 joystick
6862 istanbul
6937 wordpress
7476 luatex
7635 csound
7635 python-csoundac
8257 libpam-mount
8471 wmcpuload
8631 nbsmtp
8642 oxine
8774 axiom
8949 libengine-pkcs11-openssl
9052 clamav-getfiles
9221 open-iscsi
9262 ptex-bin
9701 egroupware-core
10012 debian-cd
10127 libjogl-java
10453 libjson-xs-perl
10698 gcjwebplugin
10736 request-tracker3.6
11294 python-extclass
11518 rkward
11600 alevt
12929 otrs2
13205 bindgraph
13956 icecc-monitor
14223 netmaze
14505 cdebconf
14526 mumble
14687 fnonlinear
14724 recite
14789 jbidwatcher
15766 libnss-ldapd
15812 glpi
15905 lustre-source
15982 win32-loader
16232 ampache
17168 libghc6-missingh-dev
17311 acl2
17830 apertium
18462 emacspeak
18469 glassfish
19257 isight-firmware-tools
19719 libnagios-object-perl
20144 libwx-perl
20465 dns2tcp
21614 python-kinterbasdb-dbg
21690 hf
21902 libcobra-java
22365 tomcat6
22573 libphp-snoopy
23501 coco-java
23926 boxbackup-server
24009 libghc6-wash-dev
26323 libdesktop-notify-perl
26439 opendb
28426 adolc
28795 xorp
29986 libgdata-java
31262 libjboss-serialization-java
31472 pixelpost
31658 cournol
32939 mediamate
34365 gforge-plugin-scmcvs
34659 libgearman-client-async-perl
42233 libhamcrest-java
48076 mahara
55303 mxallowd
85881 roxen-fonts-iso8859-1

So, if you don't know from what package to start fixing bugs, "Begin at the beginning,", the King said, very gravely, "and go on till you come to the end: then stop."