Games Workshop FAQ Grabber (Python Script)

July 22, 2010 | Reading time: 1 minute

I’m nerding out on ya, sorry. Programmers and Script Monkeys only, as I’m not going to explain how to use this.

I’ve created a Python script that grabs all the latest 40K and Fantasy FAQs/PDFs.

Here’s the script:

#!/usr/bin/python

import urllib
import sys,os

# Site root; it is prepended to the href value taken from each FAQ link.
sGWURL = "http://www.games-workshop.com"
# Article pages whose HTML is scanned for links to PDF files.
sWarhammerFAQURL = "http://www.games-workshop.com/gws/content/article.jsp?categoryId=1000018&pIndex=1&aId=3000006&start=2"
# The query string must read "&section=". "&sect" is also the HTML entity for
# the section sign, so a copy that passed through an HTML decoder ends up with
# a non-ASCII character here, which Python 2 rejects without an encoding line.
s40KFAQURL = "http://www.games-workshop.com/gws/content/article.jsp?catId=cat440134a&categoryId=1000018&section=&pIndex=1&aId=3400019&start=2"
# Download folders. WARNING: the script deletes every regular file directly
# inside these folders before downloading, so point them at dedicated folders.
sLocalFantasyFAQPath = "./" # I Normally just use the full filepath to my Dropbox Folder on my Ubuntu box
sLocal40KFAQPath = "./" # I Normally just use the full filepath to my Dropbox Folder on my Ubuntu box

def GetData(sURL):
    """Fetch sURL and return the complete response body.

    The connection is closed before returning, even when reading fails,
    so the handle is not left open until garbage collection.
    """
    oPage = urllib.urlopen(sURL)
    try:
        return oPage.read()
    finally:
        oPage.close()

def FilterOutFAQLines(sData):
    """Return the lines of sData that are anchor tags linking to a PDF.

    A line qualifies when, ignoring case, it begins with "<a href" and
    also contains ".pdf". Qualifying lines are returned unchanged (original
    case preserved) and in the order they appear in sData.
    """
    def _IsPDFLink(sLowered):
        # The required "<a href" prefix means ".pdf" can never sit at
        # index 0, so a plain containment test is sufficient here.
        return sLowered.startswith("<a href") and ".pdf" in sLowered

    return [sRaw for sRaw in sData.splitlines() if _IsPDFLink(sRaw.lower())]

def downloadFile(url,localfilename):
    """Download url and save the response body to localfilename.

    The target file is opened in binary mode so the bytes are stored
    exactly as received; text mode rewrites line endings on Windows, which
    corrupts binary content such as PDFs. An existing file is overwritten.
    Both the connection and the file are closed even if the transfer fails.
    """
    webFile = urllib.urlopen(url)
    try:
        with open(localfilename, 'wb') as localFile:
            localFile.write(webFile.read())
    finally:
        webFile.close()

def GetFile(sHREFLine, sDir):
    """Download the file referenced by one anchor-tag line into sDir.

    sHREFLine is expected to look like
        <a href="/path/to/file.pdf">Link text</a>
    The quoted href value is appended to sGWURL to form the download URL,
    and the text between the first '>' and the following '<' is used as the
    local file name.

    Returns the local file name (without the directory part).
    """
    # Find each closing delimiter relative to its opening one rather than
    # from a fixed character offset, so spacing variations such as
    # `<a href = "...">` are still parsed correctly.
    iURLStart = sHREFLine.find('"') + 1
    iURLEnd = sHREFLine.find('"', iURLStart)
    sURL = sGWURL + sHREFLine[iURLStart:iURLEnd]

    iNameStart = sHREFLine.find('>') + 1
    iNameEnd = sHREFLine.find('<', iNameStart)
    sFileName = sHREFLine[iNameStart:iNameEnd]

    # os.path.join adds a separator only when sDir does not already end
    # with one, so sDir works with or without a trailing slash.
    downloadFile(sURL, os.path.join(sDir, sFileName))
    return sFileName

def DeleteFolderContents(folder):
    """Delete every regular file directly inside folder.

    Sub-directories and their contents are left untouched. A file that
    cannot be removed is reported on standard output and skipped, so one
    failure does not stop the remaining files from being deleted.
    """
    for the_file in os.listdir(folder):
        file_path = os.path.join(folder, the_file)
        if not os.path.isfile(file_path):
            continue
        try:
            os.unlink(file_path)
        except OSError as e:
            # Best effort: report the problem and carry on with the rest.
            print(e)

# Fetch and filter both listing pages before touching the disk, so a network
# failure at this point leaves the existing files in place.
aWarhammerFAQs = FilterOutFAQLines(GetData(sWarhammerFAQURL))
a40KFAQs = FilterOutFAQLines(GetData(s40KFAQURL))

# Empty both target folders before downloading anything. The two paths may
# name the same folder (both default to "./"); wiping the second folder only
# after the first batch was saved would delete that batch again.
DeleteFolderContents(sLocalFantasyFAQPath)
DeleteFolderContents(sLocal40KFAQPath)

for sLine in aWarhammerFAQs:
    GetFile(sLine, sLocalFantasyFAQPath)
for sLine in a40KFAQs:
    GetFile(sLine, sLocal40KFAQPath)

Feel free to download it here in case your copy/paste doesn’t work.

This won’t be very useful to non-programmer types, but it might save someone an hour or so if they wanted to do the same thing.

Comments

comments powered by Disqus