MetaFilter's site and server can always use upgrades of hardware, software, and bandwidth, as well as more stable funding for continued support of its small but highly skilled moderation and backend team! If you'd like to chip in, you can donate to MetaFilter.
Mefipost.py
From Mefi Wiki
#!/usr/bin/python
"""
.. module:: mefipost
   :synopsis: A module for reading Metafilter Infodump Post files

.. moduleauthor:: Tim Kolar <tkolar@pobox.com>

This module populates a dictionary with MFPPost objects keyed by post ids.

The MFPPost object looks like this:

    postid        = (int) Post ID
    userid        = (int) User ID
    datestamp     = (datetime) The date and time a post was made
    category      = (int) The category of the post.  Applies to Askme posts
    comments      = (int) Total number of comments made in the post
    favorites     = (int) Number of times the post was favorited
    deleted       = (int) Boolean marking if the post was deleted or not
    delete_reason = (string) Deletion reason
    context       = None (anything) context passed by the user when
                    reading a file

There is only one function, mfp_readfile(filename, context).  It returns
the date and time the file was generated and the dictionary containing all
of the MFPPost objects.

Metafilter Infodump files can be retrieved from:
http://stuff.metafilter.com/infodump/
"""

#
# Example data (fields are tab separated)
#
#Sat Aug 18 00:35:26 2012
#postid  userid  datestamp  category  comments  favorites  deleted  reason
#19  1   1999-07-14 15:03:04.930 0   116 104 0   [NULL]
#25  1   1999-07-15 09:37:51.770 0   6   1   0   [NULL]
#26  16  1999-07-15 09:54:26.280 0   4   0   0   [NULL]

import sys
import datetime

# Names exported by ``from mefipost import *``; the test() helper below is
# deliberately left out.
__all__ = ["MFPPost", "mfp_readfile"]


class MFPPost:
    """
    This isn't so much a class as a C type structure with a print function.

    The class attributes below are the default values; mfp_readfile()
    overwrites them on each instance it creates.
    """
    postid = 0
    userid = 0
    datestamp = datetime.datetime(datetime.MINYEAR, 1, 1)
    category = 0
    comments = 0
    favorites = 0
    deleted = 0
    delete_reason = ""
    context = None

    def __repr__(self):
        """Return a multi-line, tab-indented dump of every field."""
        ret = "class MFPPost:\n"
        ret += "\tpostid = %d\n" % self.postid
        ret += "\tuserid = %d\n" % self.userid
        ret += "\tdatestamp = %s\n" % self.datestamp
        ret += "\tcategory = %d\n" % self.category
        ret += "\tcomments = %d\n" % self.comments
        ret += "\tfavorites = %d\n" % self.favorites
        ret += "\tdeleted = %d\n" % self.deleted
        ret += "\tdelete_reason = %s\n" % self.delete_reason.strip()

        # The context is arbitrary user data.  Wrap it in a 1-tuple so that
        # a tuple context is formatted as a value rather than being unpacked
        # as the argument list of the % operator, and fall back to a
        # placeholder if the object cannot be converted to a string at all.
        try:
            context_string = "%s" % (self.context,)
        except Exception:
            context_string = "<not a string>"
        ret += "\tcontext = %s\n" % context_string

        return ret


def mfp_readfile(filename, context):
    """
    Read an Infodump post file into a dictionary

    :param filename: The name of the file to read from.
    :type filename: str
    :param context: Whatever context you wish to attach to each entry
    :type context: Whatever you like.
    :returns: (datetime, dictionary) -- The date that the infodump was
        created and a dictionary of MFPPost() objects keyed on the postid.
    :raises ValueError: if the first line is not a valid creation date, or
        if a line with the expected number of fields holds a value that
        cannot be converted (non-numeric count, malformed datestamp).
    """
    pulldate_format = "%a %b %d %H:%M:%S %Y"
    datestamp_format = "%Y-%m-%d %H:%M:%S.%f"

    dictionary = {}

    # The context manager closes the file even if parsing raises.
    with open(filename) as infile:
        # First line: the date the dump was generated.
        pulldate_text = infile.readline().strip()
        pulldate = datetime.datetime.strptime(pulldate_text, pulldate_format)

        infile.readline()   # ignore the second line (column headers)

        for line in infile:
            fields = line.split('\t', 8)

            # Occasionally there will be a corrupt line in the file, which
            # can be detected by the lack of 7 tabs.  Ignore it.
            if len(fields) != 8:
                continue

            (postid, userid, datestamp, category,
             comments, favorites, deleted, reason) = fields

            post = MFPPost()
            post.postid = int(postid)
            post.userid = int(userid)
            post.datestamp = datetime.datetime.strptime(datestamp,
                                                        datestamp_format)
            post.category = int(category)
            post.comments = int(comments)
            post.favorites = int(favorites)
            post.deleted = int(deleted)
            # Stored exactly as read, including the trailing newline;
            # MFPPost.__repr__ strips it for display.
            post.delete_reason = reason
            post.context = context

            dictionary[post.postid] = post

    return (pulldate, dictionary)


#
# test code
#
def test():
    """Smoke test: expects the two infodump files in the current directory."""
    (mefi_pulldate, mefi_dict) = mfp_readfile("postdata_mefi.txt", None)
    (askme_pulldate, askme_dict) = mfp_readfile("postdata_askme.txt", None)

    # Single-argument print() behaves the same on Python 2 and Python 3.
    print(mefi_pulldate)
    print(askme_pulldate)
    print(mefi_dict[19])


if __name__ == '__main__':
    test()