| MetaFilter's site and server can always use upgrades of hardware, software, and bandwidth, as well as more stable funding for continued support of its small but high-skilled moderation and backend team! If you'd like to chip in, you can donate to Metafilter. |
Mefipost.py
From Mefi Wiki
Jump to navigation, Jump to search
#!/usr/bin/python
"""
.. module:: mefipost
:synopsis: A module for reading Metafilter Infodump Post files
.. moduleauthor:: Tim Kolar <tkolar@pobox.com>
This module populates a dictionary with MFPPost objects keyed
by post ids. The MFPPost object looks like this:
postid = (int) Post ID
userid = (int) User ID
datestamp = (datetime) The date and time a post was made
category = (int) The category of the post. Applies to Askme posts
comments = (int) Total number of comments made in the post
favorites = (int) Number of times the post was favorited
deleted = (int) Boolean marking if the post was deleted or not
delete_reason = (string) Deletion reason
context = (anything) Context passed by the user when reading a file (None by default)
There is only one function, mfp_readfile(filename, context). It returns
the date and time the file was generated and the dictionary containing
all of the MFPPost objects.
Metafilter Infodump files can be retrieved from:
http://stuff.metafilter.com/infodump/
"""
#
# Example data
#
#Sat Aug 18 00:35:26 2012
#postid userid datestamp category comments favorites deleted reason
#19 1 1999-07-14 15:03:04.930 0 116 104 0 [NULL]
#25 1 1999-07-15 09:37:51.770 0 6 1 0 [NULL]
#26 16 1999-07-15 09:54:26.280 0 4 0 0 [NULL]
import sys
import datetime
class MFPPost:
    """A plain record describing one post from an Infodump post file.

    This is closer to a C-style struct than a real class: it only carries
    data, plus a ``__repr__`` that dumps every field for debugging. The
    class-level values below are the defaults seen on a fresh instance;
    callers overwrite them per instance.
    """

    postid = 0          # (int) post ID
    userid = 0          # (int) ID of the user who made the post
    datestamp = datetime.datetime(datetime.MINYEAR, 1, 1)  # (datetime) when the post was made
    category = 0        # (int) category of the post
    comments = 0        # (int) total number of comments in the post
    favorites = 0       # (int) number of times the post was favorited
    deleted = 0         # (int) used as a boolean: non-zero if the post was deleted
    delete_reason = ""  # (string) deletion reason
    context = None      # (anything) caller-supplied context

    def __repr__(self):
        """Return a multi-line, tab-indented dump of every field.

        The context may be any object, so it is formatted defensively:
        it is wrapped in a 1-tuple so that a tuple-valued context is shown
        as-is instead of being consumed as ``%`` format arguments, and if
        converting it to a string raises, a placeholder is shown instead.
        """
        try:
            context_string = "%s" % (self.context,)
        except Exception:
            context_string = "<not a string>"

        lines = [
            "class MFPPost:",
            "\tpostid = %d" % self.postid,
            "\tuserid = %d" % self.userid,
            "\tdatestamp = %s" % self.datestamp,
            "\tcategory = %d" % self.category,
            "\tcomments = %d" % self.comments,
            "\tfavorites = %d" % self.favorites,
            "\tdeleted = %d" % self.deleted,
            # The reason is stripped here so surrounding whitespace (such as
            # a trailing newline) does not break the one-field-per-line layout.
            "\tdelete_reason = %s" % self.delete_reason.strip(),
            "\tcontext = %s" % context_string,
        ]
        return "\n".join(lines) + "\n"
def mfp_readfile(filename, context):
    """ Read an Infodump post file into a dictionary

    The first line of the file is parsed as the date the dump was
    generated, the second line (the column header) is skipped, and every
    following line is expected to hold 8 tab-separated fields:
    postid, userid, datestamp, category, comments, favorites, deleted
    and reason.

    :param filename: The name of the file to read from.
    :type filename: str.
    :param context: Whatever context you wish to attach to each entry
    :type context: Whatever you like.
    :returns (datetime, dictionary) -- The date that the infodump was created and a dictionary of MFPPost() objects keyed on the postid.
    :raises ValueError: if the first line or a datestamp does not match the
        expected format, or a numeric field cannot be parsed as an int.
    """
    # Both formats are constant, so they are set up once outside the loop.
    pulldate_format = "%a %b %d %H:%M:%S %Y"
    datestamp_format = "%Y-%m-%d %H:%M:%S.%f"

    dictionary = {}

    # The with-block guarantees the file is closed, even if parsing raises.
    with open(filename) as infile:
        pulldate_text = infile.readline().strip()
        pulldate = datetime.datetime.strptime(pulldate_text, pulldate_format)

        infile.readline()  # ignore the second line (column names)

        for line in infile:
            fields = line.split('\t', 8)

            # Occasionally there will be a corrupt line in the file, which can
            # be detected by the lack of 7 tabs. Ignore it.
            if len(fields) != 8:
                continue

            (postid, userid, datestamp, category,
             comments, favorites, deleted, reason) = fields

            post = MFPPost()
            post.postid = int(postid)
            post.userid = int(userid)
            post.datestamp = datetime.datetime.strptime(datestamp,
                                                        datestamp_format)
            # The category column was previously parsed but never stored,
            # leaving every post at the class default of 0.
            post.category = int(category)
            post.comments = int(comments)
            post.favorites = int(favorites)
            post.deleted = int(deleted)
            # The reason is the last field on the line and is stored as read,
            # including any trailing newline.
            post.delete_reason = reason
            post.context = context

            dictionary[post.postid] = post

    return (pulldate, dictionary)
#
# test code
#
def test():
    """Smoke test: parse two post files and print a few results.

    Reads "postdata_mefi.txt" and "postdata_askme.txt" from the current
    directory, prints the generation date of each, and prints the post
    with id 19 from the first file.
    """
    (mefi_pulldate, mefi_dict) = mfp_readfile("postdata_mefi.txt", None)
    # The askme dictionary is not inspected; reading it still exercises
    # the parser on a second file.
    (askme_pulldate, askme_dict) = mfp_readfile("postdata_askme.txt", None)

    # Single-argument print(...) calls behave the same whether print is a
    # statement (Python 2) or a function (Python 3).
    print(mefi_pulldate)
    print(askme_pulldate)
    print(mefi_dict[19])


# Run the smoke test only when executed as a script, not when imported.
if __name__ == '__main__':
    test()