User:Matt Crypto/RandomArticles

From Wikipedia, the free encyclopedia

While Wikipedia has a Random page feature, it selects pages uniformly at random from the database, so every page is equally likely regardless of how often it is read. As an alternative, I wrote a script that chooses pages at random with probability proportional to their hit counts for a month; such a set might give a more representative picture of how Wikipedia looks to visitors. The hit data for, say, September 2004 can be found here (warning: very large file). Below is an example drawn from the hits so far this month (up to 22 September 2004). If you would like a set, just send me a message and tell me a Wikipedia page, and I'll run the script for you and paste in the output. — Matt 15:06, 21 Sep 2004 (UTC)

100 randomly-selected articles (weighted by popularity)[edit]

Script[edit]

import re
from random import *

# Path of the saved hit-statistics file that the script parses.
logFile = "/tmp/url_200409.html"
# Optional cap on the number of log entries read; a false value (None) means
# no cap. The trailing comment records a cap value that can be swapped in.
maxEntries = None # 10000
# Number of articles drawn and printed by the script at the bottom of the file.
numberOfArticles = 100

# Matches one row of the hit statistics. Captured groups, in order: the hit
# count, a percentage, a second integer, a second percentage, and the page
# name that follows "/wiki/". Only the first and last groups are used.
r1 = re.compile(r'^(\d*)\s*([0-9.]*)%\s*([0-9]*)\s*([0-9.]*)%\s*/wiki/(\S*)$')


class ArticlePicker:
    """Pick articles at random, weighted by their hit counts in a log file.

    Typical use: construct with the path of the log, call readLogFile()
    once, then call selectRandomly(N).
    """

    def __init__(self, logFile, maxEntries = False):
        """Remember the log location; nothing is read until readLogFile().

        logFile    -- path of the hit-statistics file to parse.
        maxEntries -- stop after this many accepted entries; any false
                      value (the default) means "no limit".
        """
        self.logFile = logFile
        self.hitList = []
        # Initialised here so selectRandomly() is safe before any read.
        self.hitSum = 0
        self.count = 0
        self.maxEntries = maxEntries

    def readLogFile(self):
        """Parse the log file and populate hitList, hitSum and count.

        Lines rejected by parseLine() are skipped. Afterwards hitList holds
        (hits, name) tuples sorted by descending hits, hitSum is the total
        of their hits and count is the number of entries kept. Results of
        any earlier call are replaced rather than appended to.
        """
        entries = []
        total = 0
        # "with" guarantees the file is closed even if reading fails.
        with open(self.logFile) as F:
            for line in F:
                # ">=" so that exactly maxEntries entries are kept.
                if self.maxEntries and len(entries) >= self.maxEntries:
                    break
                try:
                    hits, name = self.parseLine(line)
                except ValueError:
                    # Unparseable or deliberately excluded line.
                    continue
                entries.append((hits, name))
                total += hits

        entries.sort(reverse=True)
        self.hitList = entries
        self.hitSum = total
        self.count = len(entries)

    def parseLine(self, line):
        """Return (hits, name) for one log line.

        hits is an int and name is the page name with underscores replaced
        by spaces. Raises ValueError when the line does not match the
        expected layout, when the page is excluded by filterOut(), or when
        the hit count is empty.
        """
        m = r1.match(line.strip())
        if m is None:
            raise ValueError("No matches found")
        hits, name = m.group(1, 5)
        self.filterOut(hits, name)
        return int(hits), name.replace('_', ' ')

    def filterOut(self, hits, name):
        """Raise ValueError if the page should not be counted as an article.

        hits is accepted for interface compatibility but is not inspected.
        name is the raw page name (underscores not yet replaced).
        """
        if name == "":
            raise ValueError("blank page name")
        # Anything of the form "Prefix:..." is treated as a namespace page.
        if re.match(r'^\w*:', name):
            raise ValueError("namespaced page: %s" % name)
        if name.startswith("Main_Page"):
            raise ValueError("main page")
        # Popular oddities that are not articles; compared as literal
        # prefixes so that "." only matches a real dot.
        if name.startswith(("_vti_bin/owssvr.dl", "MSOffice/cltreq.asp")):
            raise ValueError("non-article request: %s" % name)

    def selectRandomly(self, N = 1):
        """Return a list of N (hits, name) tuples drawn with replacement.

        Each draw picks an entry with probability proportional to its hits:
        a random point in [0, hitSum) is chosen and the entry whose running
        total first reaches that point is selected. Slots that could not be
        filled (for example when hitList is empty) are left as None.
        """
        outputs = [None] * N
        # Pair every random threshold with its output slot, then sort so a
        # single pass over hitList can satisfy all of them in order.
        thresholds = sorted((random() * self.hitSum, slot) for slot in range(N))
        pending = 0
        totalSoFar = 0
        for hits, name in self.hitList:
            if pending >= N:
                break
            totalSoFar += hits
            while pending < N and totalSoFar >= thresholds[pending][0]:
                outputs[thresholds[pending][1]] = (hits, name)
                pending += 1
        return outputs

# Dump the articles: read the log, draw numberOfArticles weighted-random
# entries and print them as a wiki-markup bulleted list under a heading.
H = ArticlePicker(logFile, maxEntries)
H.readLogFile()
randomArticles =  H.selectRandomly(numberOfArticles)
# Single-argument print(...) behaves the same as a statement or a function.
print("==%d randomly-selected articles (weighted by popularity)==" % numberOfArticles)
for entry in randomArticles:
    # selectRandomly() leaves a slot as None when it could not be filled
    # (e.g. the log yielded no usable entries); skip those instead of
    # failing while unpacking.
    if entry is None:
        continue
    hits, name = entry
    print("* %s — (%d hits)" % (name, hits))