MetaFilter's site and server can always use upgrades of hardware, software, and bandwidth, as well as more stable funding for continued support of its small but highly skilled moderation and backend team! If you'd like to chip in, you can donate to MetaFilter.

Mefipost.py

From Mefi Wiki
Jump to: navigation, search
#!/usr/bin/python

"""

.. module:: mefipost
   :synopsis: A module for reading Metafilter Infodump Post files

.. moduleauthor:: Tim Kolar <tkolar@pobox.com>

	This module populates a dictionary with MFPPost objects keyed
	by post ids.  The MFPPost object looks like this:

	postid = (int) Post ID
	userid = (int) User ID
	datestamp = (datetime) The date and time a post was made
	category = (int) The category of the post. Applies to Askme posts
	comments = (int) Total number of comments made in the post
	favorites = (int) Number of times the post was favorited
	deleted = (int) Boolean marking if the post was deleted or not
	delete_reason = (string) Deletion reason
	context = None (anything) context passed by the user when reading a file

	There is only one function, mfp_readfile(filename, context).  It returns
	the date and time the file was generated and the dictionary containing
	all of the MFPPost objects.

	Metafilter Infodump files can be retrieved from:
	
		http://stuff.metafilter.com/infodump/
"""

#
#  Example data
#
#Sat Aug 18 00:35:26 2012
#postid	userid	datestamp	category	comments	favorites	deleted	reason
#19	1	1999-07-14 15:03:04.930	0	116	104	0	[NULL]
#25	1	1999-07-15 09:37:51.770	0	6	1	0	[NULL]
#26	16	1999-07-15 09:54:26.280	0	4	0	0	[NULL]

import sys
import datetime

class MFPPost:
	""" A plain record for one Infodump post, with a printable form.

	The fields are class attributes holding default values; assigning to
	them on an instance shadows the default for that instance only.

	postid = (int) Post ID
	userid = (int) User ID
	datestamp = (datetime) The date and time a post was made
	category = (int) The category of the post
	comments = (int) Total number of comments made in the post
	favorites = (int) Number of times the post was favorited
	deleted = (int) Boolean marking if the post was deleted or not
	delete_reason = (string) Deletion reason
	context = (anything) Caller-supplied context, None by default
	"""
	postid = 0
	userid = 0
	datestamp = datetime.datetime(datetime.MINYEAR, 1, 1)
	category = 0
	comments = 0
	favorites = 0
	deleted = 0
	delete_reason = ""
	context = None

	def __repr__(self):
		""" Return a multi-line dump of every field, one per line.

		:returns: str -- a "class MFPPost:" header followed by one
			tab-indented "name = value" line per field, each terminated
			by a newline.
		"""
		# The context can be any object.  Wrapping it in a 1-tuple stops
		# the % operator from unpacking a tuple context as several format
		# arguments, and the fallback covers objects whose own string
		# conversion raises, so repr() itself never fails on the context.
		try:
			context_string = "%s" % (self.context,)
		except Exception:
			context_string = "<not a string>"

		lines = [
			"class MFPPost:",
			"\tpostid = %d" % self.postid,
			"\tuserid = %d" % self.userid,
			"\tdatestamp = %s" % self.datestamp,
			"\tcategory = %d" % self.category,
			"\tcomments = %d" % self.comments,
			"\tfavorites = %d" % self.favorites,
			"\tdeleted = %d" % self.deleted,
			# delete_reason may still carry the line terminator of the
			# input line it was read from, so trim it for display.
			"\tdelete_reason = %s" % self.delete_reason.strip(),
			"\tcontext = %s" % context_string,
		]
		return "\n".join(lines) + "\n"

def mfp_readfile(filename, context):
	""" Read an Infodump post file into a dictionary

	The first line of the file is parsed as the time the dump was
	generated, the second line (the column header) is discarded, and every
	following line is split on tabs into one MFPPost.

	:param filename: The name of the file to read from.
	:type filename: str.
	:param context: Whatever context you wish to attach to each entry
	:type context: Whatever you like.
	:returns (datetime, dictionary) -- The date that the infodump was created and a dictionary of MFPPost() objects keyed on the postid.
	:raises ValueError: if the first line does not match the expected
		timestamp format, or if a line with the right number of fields
		holds a value that cannot be parsed as an int or datestamp.
	"""

	pulldate_format = "%a %b %d %H:%M:%S %Y"
	datestamp_format = "%Y-%m-%d %H:%M:%S.%f"
	dictionary = {}

	# The with-block closes the file even when a line fails to parse.
	with open(filename) as infile:
		pulldate_text = infile.readline().strip()
		pulldate = datetime.datetime.strptime(pulldate_text, pulldate_format)
		infile.readline() # ignore the second line

		for line in infile:
			fields = line.split('\t', 8)

			# Occasionally there will be a corrupt line in the file, which
			# can be detected by it not having exactly 7 tabs. Ignore it.
			if len(fields) != 8:
				continue

			(postid, userid, datestamp, category, comments, favorites, deleted, reason) = fields
			post = MFPPost()
			post.postid = int(postid)
			post.userid = int(userid)
			post.datestamp = datetime.datetime.strptime(datestamp, datestamp_format)
			# Store the category; without this every post would keep the
			# class default of 0.
			post.category = int(category)
			post.comments = int(comments)
			post.favorites = int(favorites)
			post.deleted = int(deleted)
			# Being the last field, reason keeps the line's trailing
			# newline; it is stored untouched.
			post.delete_reason = reason
			post.context = context
			# A later line with the same postid replaces the earlier one.
			dictionary[post.postid] = post

	return (pulldate, dictionary)




#
# test code
#
def test():
	""" Smoke test: parse the two post files and print a few results.

	Expects postdata_mefi.txt and postdata_askme.txt in the current
	directory, and a post with id 19 in the first of them.
	"""
	(mefi_pulldate, mefi_dict) = mfp_readfile("postdata_mefi.txt", None)
	(askme_pulldate, askme_dict) = mfp_readfile("postdata_askme.txt", None)
	# A single-argument print(...) prints the same text under Python 2 and
	# is also valid Python 3, unlike the bare print statement.
	print(mefi_pulldate)
	print(askme_pulldate)
	print(mefi_dict[19])

# Run the smoke test only when executed as a script, not when imported.
if __name__ == '__main__':
	test()