# -*- coding: utf-8 -*-
"""Load the tab-separated tables of an infodump zip archive into memory.

Each table is a text member of the archive.  Its first two lines (named
``timestamp`` and ``headings`` by the original author) are skipped, and every
following line is one record whose columns are separated by tabs.

When run as a script, the module loads the user table plus the post, title
and tag tables of several sites into module-level dictionaries.  Importing
the module only defines the classes and functions.
"""
import string
import zipfile


class user:
    """One record of the ``usernames`` table."""

    def __init__(self):
        self.userid = ''
        self.joindate = ''
        self.name = ''


class post:
    """One record of a ``postdata`` table.

    ``title`` is filled in by ``assignposttitles`` and ``tags`` by
    ``assigntags``; ``loaddata`` leaves both at their defaults.
    """

    def __init__(self):
        self.postid = ''
        self.userid = ''
        self.datestamp = ''
        self.category = ''
        self.comments = 0
        self.favorites = 0
        self.deleted = ''
        self.reason = ''
        self.title = ''
        self.tags = []


class tag:
    """One record of a ``tagdata`` table; ``linkid`` is the id of the tagged post."""

    def __init__(self):
        self.tagid = ''
        self.linkid = ''
        self.linkdate = ''
        self.tagname = ''


class comment:
    """One record of a ``commentdata`` table.

    ``length`` is never populated by ``loaddata`` and keeps its default of 0.
    """

    def __init__(self):
        self.commentid = ''
        self.postid = ''
        self.userid = ''
        self.datestamp = ''
        self.faves = ''
        self.bestanswer = ''
        self.length = 0


def _parseuser(fields):
    """Build a ``user`` from the columns of one ``usernames`` record."""
    userobj = user()
    userobj.userid = fields[0]
    userobj.joindate = fields[1]
    userobj.name = fields[2].strip()
    return userobj


def _parsepost(fields):
    """Build a ``post`` from the columns of one ``postdata`` record."""
    postobj = post()
    postobj.postid = fields[0]
    postobj.userid = fields[1]
    postobj.datestamp = fields[2]
    postobj.category = fields[3]
    postobj.comments = int(fields[4])
    postobj.favorites = int(fields[5])
    postobj.deleted = fields[6]
    # The reason column is optional: stripping the line drops it when empty.
    if len(fields) > 7:
        postobj.reason = fields[7]
    return postobj


def _parsetitle(fields):
    """Return the title column of one ``posttitles`` record ("" if absent)."""
    if len(fields) > 1:
        return fields[1]
    return ""


def _parsetag(fields):
    """Build a ``tag`` from the columns of one ``tagdata`` record."""
    tagobj = tag()
    tagobj.tagid = fields[0]
    tagobj.linkid = fields[1]
    tagobj.linkdate = fields[2]
    tagobj.tagname = fields[3]
    return tagobj


def _parsecomment(fields):
    """Build a ``comment`` from the columns of one ``commentdata`` record."""
    commentobj = comment()
    commentobj.commentid = fields[0]
    commentobj.postid = fields[1]
    commentobj.userid = fields[2]
    commentobj.datestamp = fields[3]
    commentobj.faves = int(fields[4])
    commentobj.bestanswer = fields[5]
    return commentobj


# Filename marker -> record parser, in the order the original checks ran.
_ROWPARSERS = (
    ("usernames", _parseuser),
    ("postdata", _parsepost),
    ("posttitles", _parsetitle),
    ("tagdata", _parsetag),
    ("commentdata", _parsecomment),
)


def loaddata(thezipfile, filename):
    """Read one table from the archive and return it as a dictionary.

    The kind of record is chosen from substrings of ``filename``
    ("usernames", "postdata", "posttitles", "tagdata", "commentdata").
    A filename matching none of them yields an empty dictionary.

    Args:
        thezipfile: an open ``zipfile.ZipFile``.
        filename: name of the member to read.

    Returns:
        A dict keyed by the first column of each record.  Values are
        ``user``, ``post``, ``tag`` or ``comment`` objects, or the title
        string for "posttitles" members.

    Raises:
        KeyError: if ``filename`` is not a member of the archive.
        IndexError, ValueError: if a record has too few columns or a
            non-numeric count column.
    """
    print("Loading %s..." % filename)
    # Resolve the record type once instead of re-testing the filename for
    # every line; all matches are kept so the last one wins, as before.
    parsers = [parser for marker, parser in _ROWPARSERS if marker in filename]
    datadict = {}
    with thezipfile.open(filename) as datafile:
        datafile.readline()  # timestamp line
        datafile.readline()  # column headings
        for line in datafile:
            if not isinstance(line, str):
                # Python 3 yields bytes here.  NOTE(review): assumes the dump
                # is UTF-8 -- confirm; undecodable bytes are replaced.
                line = line.decode("utf-8", "replace")
            stripped = line.strip()
            if not stripped:
                # Blank lines carry no record.
                continue
            fields = stripped.split("\t")
            for parser in parsers:
                datadict[fields[0]] = parser(fields)
    return datadict


def assignposttitles(postsdict, titlesdict):
    """Set ``title`` on every post from ``titlesdict``, keyed by post id.

    Raises:
        KeyError: if a post's ``postid`` has no entry in ``titlesdict``.
    """
    for postkey in postsdict:
        thispost = postsdict[postkey]
        thispost.title = titlesdict[thispost.postid]


def assigntags(postsdict, tagsdict):
    """Append each tag to the ``tags`` list of the post it links to."""
    for tagkey in tagsdict:
        thistag = tagsdict[tagkey]
        # needed because askmepost 56056 no longer exists but still has
        # tags--some kind of data loss back in the day?
        if thistag.linkid in postsdict:
            postsdict[thistag.linkid].tags.append(thistag)


# Runs only when executed as a script, so importing the module does not try
# to open the hard-coded archive.  The statements stay at module level so the
# resulting global names are the same as before.
if __name__ == "__main__":
    thezipfile = zipfile.ZipFile("/home/jeremy/Desktop/infodump/infodump-all.zip", "r")

    usersdict = loaddata(thezipfile, "usernames.txt")

    # askme
    askmeposts = loaddata(thezipfile, "postdata_askme.txt")
    askmeposttitles = loaddata(thezipfile, "posttitles_askme.txt")
    assignposttitles(askmeposts, askmeposttitles)
    askmeposttitles = None  # drop the reference once the titles are attached
    askmetags = loaddata(thezipfile, "tagdata_askme.txt")
    assigntags(askmeposts, askmetags)
    # Disabled: loading the comment table takes forever.
    #askmecomments = loaddata(thezipfile, "commentdata_askme.txt")

    # mefi
    mefiposts = loaddata(thezipfile, "postdata_mefi.txt")
    mefiposttitles = loaddata(thezipfile, "posttitles_mefi.txt")
    assignposttitles(mefiposts, mefiposttitles)
    mefiposttitles = None
    mefitags = loaddata(thezipfile, "tagdata_mefi.txt")
    assigntags(mefiposts, mefitags)

    # meta
    metaposts = loaddata(thezipfile, "postdata_meta.txt")
    metaposttitles = loaddata(thezipfile, "posttitles_meta.txt")
    assignposttitles(metaposts, metaposttitles)
    metaposttitles = None
    metatags = loaddata(thezipfile, "tagdata_meta.txt")
    assigntags(metaposts, metatags)

    # music
    musicposts = loaddata(thezipfile, "postdata_music.txt")
    musicposttitles = loaddata(thezipfile, "posttitles_music.txt")
    assignposttitles(musicposts, musicposttitles)
    musicposttitles = None
    #musictags = loaddata(thezipfile, "tagdata_music.txt")
    #assigntags(musicposts,musictags)