Simplephp blog converter available

1 mypalmike Apr 21, 2008 10:30

#
#
# Python script to convert simplephpblog to movabletype export format.
# I wrote this for importing my simplephpblog into b2evolution, which has a movabletype
# importer, so it has not actually been tested with movabletype.
#
# Directions:
# 1) Download your 'content' directory tree from your hosting provider.  If you don't know
#    how to do this, try googling "wget" as a starting point.  If you do use wget, be sure
#    to enable recursion (-r) and expand the maximum recursion depth (-l 20)
# 2) Place this script in the same directory as the content directory is in
# 3) Modify defaults, below
# 4) Cross fingers
# 5) Run the script.  The output goes to standard output, so you will need to redirect it to a file,
#    e.g. "python sphp2mt.py > myblogcontent.txt"
#
# By mypalmike
#

import os
import re
from datetime import datetime, timedelta

# Set your defaults here
# Author name emitted for entries (see alwaysoverrideauthor)
defaultauthor = 'mypalmike'
# Non-zero: ignore any AUTHOR stored in the entry and always emit defaultauthor
alwaysoverrideauthor = 1
# simplephpblog category number -> category name to emit
categorymap = {
	1 : 'First Category',
	2 : 'Second Category',
	3 : 'Third Category',
	4 : 'Fourth Category',
}
# Prefix that replaces the relative 'images/' path inside [img] tags
imagesurl = 'http://www.mycoolsite.com/blogs/media/blogs/blog1'
# Offset from UTC - Too lazy to do real timezones
delta = timedelta( hours = -4 )
# Comments whose NAME is in this set are not emitted
spammers = set( [ 'spammer1', 'spammer2' ] )

# You should only need to modify below to add functionality and/or fix bugs.

# Compile regular expressions once here.
# Raw strings are used so the backslash escapes reach the regex engine untouched.
regex_simpletag = re.compile( r'\[(/?([bi]|h\d|strike))\]' )
regex_moretag = re.compile( r'\[more\]' )
regex_imgtag = re.compile( r'\[img=(.*?)( popup=(true|false))?\]' )
regex_urltag = re.compile( r'\[url=(.*?)\]((.|[\r\n])*?)\[/url\]' )
regex_htmltag = re.compile( r'\[html\]((.|[\r\n])*?)\[/html\]' )
regex_images = re.compile( r'images/(.*)' )
# HTML entity for a single quote.  The forum rendered the entity as a bare quote
# character when the script was posted, which left a broken string literal here.
regex_sglquote = re.compile( '&#039;' )
regex_dblquote = re.compile( '&quot;' )
regex_ampersand = re.compile( '&amp;' )
regex_lt = re.compile( '&lt;' )
regex_gt = re.compile( '&gt;' )

# Callback for filtering [html] tags
def htmlfilter( match ):
	"""Return the text captured between [html] and [/html] with &lt; and &gt; turned back into < and >."""
	return regex_gt.sub( '>', regex_lt.sub( '<', match.group(1) ) )

# Callback for filtering [img] tags
def imgfilter( match ):
	"""Return an <img> tag for the path captured by an [img=...] match.

	Any 'images/...' portion of the path is replaced by imagesurl, a slash and
	the text that followed 'images/'.
	"""
	rewritten = regex_images.sub( imagesurl + r'/\1', match.group(1) )
	return '<img src="%s"/>' % rewritten

# Get a dictionary of key-value pairs from bar-separated string
def parse( keyvaluepairs ):
	"""Split a bar-separated 'KEY|value|KEY|value...' string into a dictionary.

	keyvaluepairs -- string whose '|'-separated fields alternate between keys
	                 and values.

	Returns a dict mapping each key to the field that follows it.  When a key
	occurs more than once the last value wins.  A trailing field that has no
	partner (odd number of fields, which includes the empty string) is ignored
	instead of raising IndexError.
	"""
	parts = keyvaluepairs.split( '|' )
	result = {}
	# Stop one short of the end so that parts[idx + 1] always exists.
	for idx in range( 0, len( parts ) - 1, 2 ):
		result[ parts[idx] ] = parts[ idx + 1 ]
	return result

def filterBody( text ):
	"""Return an entry body with simplephpblog markup and entities rewritten.

	The substitutions run one after another in the order listed below; each
	one works on the output of the previous step.
	"""
	steps = (
		( regex_simpletag, r'<\1>' ),
		( regex_moretag, '[teaserbreak]' ),
		( regex_imgtag, imgfilter ),
		( regex_urltag, r'<a href="\1">\2</a>' ),
		( regex_sglquote, "'" ),
		( regex_dblquote, '"' ),
		( regex_ampersand, '&' ),
		( regex_htmltag, htmlfilter ),
	)
	converted = text
	for pattern, replacement in steps:
		converted = pattern.sub( replacement, converted )
	return converted

def filterComment( text ):
	"""Return a comment body with simple tags, [url] tags and quote/ampersand entities rewritten.

	The substitutions run one after another in the order listed below.
	"""
	steps = (
		( regex_simpletag, r'<\1>' ),
		( regex_urltag, r'<a href="\1">\2</a>' ),
		( regex_sglquote, "'" ),
		( regex_dblquote, '"' ),
		( regex_ampersand, '&' ),
	)
	converted = text
	for pattern, replacement in steps:
		converted = pattern.sub( replacement, converted )
	return converted

def emitentry( entry ):
	"""Print one blog entry in movabletype export format.

	entry -- raw bar-separated contents of an entry file, as accepted by parse().

	The SUBJECT, DATE and CONTENT keys are required (KeyError otherwise);
	AUTHOR and CATEGORIES are optional.  CATEGORIES may hold a single category
	number or a comma-separated list: the first number selects the primary
	category and every further number found in categorymap is printed as an
	extra CATEGORY line.  Ids that are empty or not integers are skipped
	instead of raising ValueError.

	The print calls take a single pre-formatted string so that the output is
	the same under Python 2 and Python 3.
	"""
	fields = parse( entry )
	author = defaultauthor
	if 'AUTHOR' in fields and not alwaysoverrideauthor:
		author = fields['AUTHOR']
	title = fields['SUBJECT']
	timestamp = datetime.utcfromtimestamp( float( fields['DATE'] ) ) + delta

	# Collect every usable category number, keeping the order from the entry.
	categoryNums = []
	for token in fields.get( 'CATEGORIES', '' ).split( ',' ):
		try:
			categoryNums.append( int( token ) )
		except ValueError:
			pass  # empty or non-numeric id: there is nothing to look up
	category = 'Uncategorized'
	if categoryNums and categoryNums[0] in categorymap:
		category = categorymap[ categoryNums[0] ]

	# Author must come first for b2evolution importer...
	print( 'AUTHOR: %s' % author )
	print( 'TITLE: %s' % title )
	print( 'DATE: %s' % timestamp.strftime( "%m/%d/%Y %H:%M:%S" ) )
	for categoryNum in categoryNums[1:]:
		if categoryNum in categorymap:
			print( 'CATEGORY: %s' % categorymap[ categoryNum ] )
	print( 'PRIMARY CATEGORY: %s' % category )
	# Optional fields, disabled by default:
	#	STATUS: publish
	#	ALLOW COMMENTS: 1
	#	ALLOW PINGS: 0
	print( '-----' )
	print( 'BODY:' )
	print( filterBody( fields['CONTENT'] ) )
	print( '-----' )

def emitcomment( comment ):
	"""Print one comment in movabletype export format, unless its NAME is in spammers.

	comment -- raw bar-separated contents of a comment file, as accepted by parse().

	NAME, DATE and CONTENT are required; URL, IP and EMAIL are printed only
	when present.
	"""
	fields = parse( comment )
	commenter = fields['NAME']
	if commenter in spammers:
		return

	timestamp = datetime.utcfromtimestamp( float( fields['DATE'] ) ) + delta

	print( 'COMMENT:' )
	print( 'AUTHOR: %s' % commenter )
	print( 'DATE: %s' % timestamp.strftime( "%m/%d/%Y %H:%M:%S" ) )
	# Optional fields, emitted in this fixed order.
	for key in ( 'URL', 'IP', 'EMAIL' ):
		if key in fields:
			print( '%s: %s' % ( key, fields[key] ) )
	print( filterComment( fields['CONTENT'] ) )
	print( '-----' )

def _readfile( path ):
	"""Return the complete contents of the file at path."""
	handle = open( path, 'r' )
	contents = handle.read()
	handle.close()
	return contents

def convert():
	"""Walk the 'content' tree and print every entry followed by its comments.

	Every file whose name starts with 'entry' is emitted as an entry.  Comments
	are then looked for below the directory that shares the entry file's name
	up to its first '.', and an 8-dash separator line closes the entry.
	"""
	for root, dirs, files in os.walk( 'content' ):
		for name in files:
			if not name.startswith( 'entry' ):
				continue
			commentroot = root + '/' + name.split( '.' )[0]
			emitentry( _readfile( root + '/' + name ) )
			for subroot, subdirs, subfiles in os.walk( commentroot ):
				for subname in subfiles:
					if subname.startswith( 'comment' ):
						emitcomment( _readfile( subroot + '/' + subname ) )
			print( '--------' )

# Go
convert()

2 mochababy Apr 21, 2008 19:09

mypalmike wrote:

#
#
# Python script to convert simplephpblog to movabletype export format.
# I wrote this for importing my simplephpblog into b2evolution, which has a movabletype
# importer, so it has not actually been tested with movabletype.
#
# Directions:
# 1) Download your 'content' directory tree from your hosting provider.  If you don't know
#    how to do this, try googling "wget" as a starting point.  If you do use wget, be sure
#    to enable recursion (-r) and expand the maximum recursion depth (-l 20)
# 2) Place this script in the same directory as the content directory is in
# 3) Modify defaults, below
# 4) Cross fingers
# 5) Run the script.  The output goes to standard output, so you will need to redirect it to a file,
#    e.g. "python sphp2mt.py > myblogcontent.txt"
#
# By mypalmike
#

import os
import re
from datetime import datetime, timedelta

# Set your defaults here
# Author name emitted for entries (see alwaysoverrideauthor)
defaultauthor = 'mypalmike'
# Non-zero: ignore any AUTHOR stored in the entry and always emit defaultauthor
alwaysoverrideauthor = 1
# simplephpblog category number -> category name to emit
categorymap = {
	1 : 'First Category',
	2 : 'Second Category',
	3 : 'Third Category',
	4 : 'Fourth Category',
}
# Prefix that replaces the relative 'images/' path inside [img] tags
imagesurl = 'http://www.mycoolsite.com/blogs/media/blogs/blog1'
# Offset from UTC - Too lazy to do real timezones
delta = timedelta( hours = -4 )
# Comments whose NAME is in this set are not emitted
spammers = set( [ 'spammer1', 'spammer2' ] )

# You should only need to modify below to add functionality and/or fix bugs.

# Compile regular expressions once here.
# Raw strings are used so the backslash escapes reach the regex engine untouched.
regex_simpletag = re.compile( r'\[(/?([bi]|h\d|strike))\]' )
regex_moretag = re.compile( r'\[more\]' )
regex_imgtag = re.compile( r'\[img=(.*?)( popup=(true|false))?\]' )
regex_urltag = re.compile( r'\[url=(.*?)\]((.|[\r\n])*?)\[/url\]' )
regex_htmltag = re.compile( r'\[html\]((.|[\r\n])*?)\[/html\]' )
regex_images = re.compile( r'images/(.*)' )
# HTML entity for a single quote.  The forum rendered the entity as a bare quote
# character when the script was posted, which left a broken string literal here.
regex_sglquote = re.compile( '&#039;' )
regex_dblquote = re.compile( '&quot;' )
regex_ampersand = re.compile( '&amp;' )
regex_lt = re.compile( '&lt;' )
regex_gt = re.compile( '&gt;' )

# Callback for filtering [html] tags
def htmlfilter( match ):
	"""Return the text captured between [html] and [/html] with &lt; and &gt; turned back into < and >."""
	return regex_gt.sub( '>', regex_lt.sub( '<', match.group(1) ) )

# Callback for filtering [img] tags
def imgfilter( match ):
	"""Return an <img> tag for the path captured by an [img=...] match.

	Any 'images/...' portion of the path is replaced by imagesurl, a slash and
	the text that followed 'images/'.
	"""
	rewritten = regex_images.sub( imagesurl + r'/\1', match.group(1) )
	return '<img src="%s"/>' % rewritten

# Get a dictionary of key-value pairs from bar-separated string
def parse( keyvaluepairs ):
	"""Split a bar-separated 'KEY|value|KEY|value...' string into a dictionary.

	keyvaluepairs -- string whose '|'-separated fields alternate between keys
	                 and values.

	Returns a dict mapping each key to the field that follows it.  When a key
	occurs more than once the last value wins.  A trailing field that has no
	partner (odd number of fields, which includes the empty string) is ignored
	instead of raising IndexError.
	"""
	parts = keyvaluepairs.split( '|' )
	result = {}
	# Stop one short of the end so that parts[idx + 1] always exists.
	for idx in range( 0, len( parts ) - 1, 2 ):
		result[ parts[idx] ] = parts[ idx + 1 ]
	return result

def filterBody( text ):
	"""Return an entry body with simplephpblog markup and entities rewritten.

	The substitutions run one after another in the order listed below; each
	one works on the output of the previous step.
	"""
	steps = (
		( regex_simpletag, r'<\1>' ),
		( regex_moretag, '<!--more-->' ),
		( regex_imgtag, imgfilter ),
		( regex_urltag, r'<a href="\1">\2</a>' ),
		( regex_sglquote, "'" ),
		( regex_dblquote, '"' ),
		( regex_ampersand, '&' ),
		( regex_htmltag, htmlfilter ),
	)
	converted = text
	for pattern, replacement in steps:
		converted = pattern.sub( replacement, converted )
	return converted

def filterComment( text ):
	"""Return a comment body with simple tags, [url] tags and quote/ampersand entities rewritten.

	The substitutions run one after another in the order listed below.
	"""
	steps = (
		( regex_simpletag, r'<\1>' ),
		( regex_urltag, r'<a href="\1">\2</a>' ),
		( regex_sglquote, "'" ),
		( regex_dblquote, '"' ),
		( regex_ampersand, '&' ),
	)
	converted = text
	for pattern, replacement in steps:
		converted = pattern.sub( replacement, converted )
	return converted

def emitentry( entry ):
	"""Print one blog entry in movabletype export format.

	entry -- raw bar-separated contents of an entry file, as accepted by parse().

	The SUBJECT, DATE and CONTENT keys are required (KeyError otherwise);
	AUTHOR and CATEGORIES are optional.  CATEGORIES may hold a single category
	number or a comma-separated list: the first number selects the primary
	category and every further number found in categorymap is printed as an
	extra CATEGORY line.  Ids that are empty or not integers are skipped
	instead of raising ValueError.

	The print calls take a single pre-formatted string so that the output is
	the same under Python 2 and Python 3.
	"""
	fields = parse( entry )
	author = defaultauthor
	if 'AUTHOR' in fields and not alwaysoverrideauthor:
		author = fields['AUTHOR']
	title = fields['SUBJECT']
	timestamp = datetime.utcfromtimestamp( float( fields['DATE'] ) ) + delta

	# Collect every usable category number, keeping the order from the entry.
	categoryNums = []
	for token in fields.get( 'CATEGORIES', '' ).split( ',' ):
		try:
			categoryNums.append( int( token ) )
		except ValueError:
			pass  # empty or non-numeric id: there is nothing to look up
	category = 'Uncategorized'
	if categoryNums and categoryNums[0] in categorymap:
		category = categorymap[ categoryNums[0] ]

	# Author must come first for b2evolution importer...
	print( 'AUTHOR: %s' % author )
	print( 'TITLE: %s' % title )
	print( 'DATE: %s' % timestamp.strftime( "%m/%d/%Y %H:%M:%S" ) )
	for categoryNum in categoryNums[1:]:
		if categoryNum in categorymap:
			print( 'CATEGORY: %s' % categorymap[ categoryNum ] )
	print( 'PRIMARY CATEGORY: %s' % category )
	# Optional fields, disabled by default:
	#	STATUS: publish
	#	ALLOW COMMENTS: 1
	#	ALLOW PINGS: 0
	print( '-----' )
	print( 'BODY:' )
	print( filterBody( fields['CONTENT'] ) )
	print( '-----' )

def emitcomment( comment ):
	"""Print one comment in movabletype export format, unless its NAME is in spammers.

	comment -- raw bar-separated contents of a comment file, as accepted by parse().

	NAME, DATE and CONTENT are required; URL, IP and EMAIL are printed only
	when present.
	"""
	fields = parse( comment )
	commenter = fields['NAME']
	if commenter in spammers:
		return

	timestamp = datetime.utcfromtimestamp( float( fields['DATE'] ) ) + delta

	print( 'COMMENT:' )
	print( 'AUTHOR: %s' % commenter )
	print( 'DATE: %s' % timestamp.strftime( "%m/%d/%Y %H:%M:%S" ) )
	# Optional fields, emitted in this fixed order.
	for key in ( 'URL', 'IP', 'EMAIL' ):
		if key in fields:
			print( '%s: %s' % ( key, fields[key] ) )
	print( filterComment( fields['CONTENT'] ) )
	print( '-----' )

def _readfile( path ):
	"""Return the complete contents of the file at path."""
	handle = open( path, 'r' )
	contents = handle.read()
	handle.close()
	return contents

def convert():
	"""Walk the 'content' tree and print every entry followed by its comments.

	Every file whose name starts with 'entry' is emitted as an entry.  Comments
	are then looked for below the directory that shares the entry file's name
	up to its first '.', and an 8-dash separator line closes the entry.
	"""
	for root, dirs, files in os.walk( 'content' ):
		for name in files:
			if not name.startswith( 'entry' ):
				continue
			commentroot = root + '/' + name.split( '.' )[0]
			emitentry( _readfile( root + '/' + name ) )
			for subroot, subdirs, subfiles in os.walk( commentroot ):
				for subname in subfiles:
					if subname.startswith( 'comment' ):
						emitcomment( _readfile( subroot + '/' + subname ) )
			print( '--------' )

# Go
convert()

I haven't tried this myself, but it looks like you are blazing the trail for a lot of people who would like to switch but don't want to lose their audience... good job!

~mochababy~

3 mypalmike Apr 21, 2008 20:25

Thanks for your kind feedback.

The script got slightly messed up when posting here...

regex_sglquote = re.compile( 'XXX' )

'XXX' here should be '[ampersand]039[semicolon]' instead of just a single quote character.

There may be other problems - I happened to just notice this one. I will place the script on my site tonight so that it can be downloaded without this issue. See the "external links" section of www.mypalmike.com.

FYI... I switched to b2evolution because sphpblog was getting very slow with the growing size of my content. People who regularly visited and commented sent me email complaining about the performance - things like one minute delays after clicking "post comment". I presumed, apparently correctly, it was due to the fact that the content in sphpblog is stored in files rather than a db, making efficient indexing impossible. The script I posted converted my ~400 entries and ~5000 comments (including comment spam... grrr!) with no obvious problems. The resulting site is leaps and bounds more responsive. :) Thanks to the b2evolution developers for making such a cool piece of software. (Simplephpblog is also a very cool piece of software, I might add.)

Even though the script is meant to be fairly generic, there is certainly stuff that will not convert properly with this script, as it was designed somewhat based on the tags that existed in my own content. For instance, all my img tags point to images on the relative path 'images/' on my site, so img tags which reference external sites will not translate properly.

4 r_kleineisel Sep 23, 2008 16:54

I have a small addition to this great script. This will import posts with more than 1 category. Probably not very elegant, but I don't know anything about Python. You should have written this in Perl ;-)

def emitentry( entry ):
	"""Print one blog entry in movabletype export format, with multi-category support.

	entry -- raw bar-separated contents of an entry file, as accepted by parse().

	The SUBJECT, DATE and CONTENT keys are required (KeyError otherwise);
	AUTHOR and CATEGORIES are optional.  CATEGORIES may hold a single category
	number or a comma-separated list: the first number selects the primary
	category and every further number found in categorymap is printed as an
	extra CATEGORY line.  Ids that are empty or not integers are skipped.

	The print calls take a single pre-formatted string so that the output is
	the same under Python 2 and Python 3.
	"""
	fields = parse( entry )
	author = defaultauthor
	if 'AUTHOR' in fields and not alwaysoverrideauthor:
		author = fields['AUTHOR']
	title = fields['SUBJECT']
	timestamp = datetime.utcfromtimestamp( float( fields['DATE'] ) ) + delta
	# Author must come first for b2evolution importer...
	print( 'AUTHOR: %s' % author )
	print( 'TITLE: %s' % title )
	print( 'DATE: %s' % timestamp.strftime( "%m/%d/%Y %H:%M:%S" ) )

	# Collect every usable category number, keeping the order from the entry.
	cat_nums = []
	for cat in fields.get( 'CATEGORIES', '' ).split( ',' ):
		try:
			cat_nums.append( int( cat ) )
		except ValueError:
			pass  # empty or non-numeric id: there is nothing to look up
	prim_category = 'Uncategorized'
	if cat_nums and cat_nums[0] in categorymap:
		prim_category = categorymap[ cat_nums[0] ]
	# categorymap is keyed by integers, so look up the converted number,
	# not the string that came out of the split.
	for cat_num in cat_nums[1:]:
		if cat_num in categorymap:
			print( 'CATEGORY: %s' % categorymap[ cat_num ] )
	print( 'PRIMARY CATEGORY: %s' % prim_category )
	# Optional fields, disabled by default:
	#	STATUS: publish
	#	ALLOW COMMENTS: 1
	#	ALLOW PINGS: 0
	print( '-----' )
	print( 'BODY:' )
	print( filterBody( fields['CONTENT'] ) )
	print( '-----' )

5 amaxson Jul 09, 2010 06:47

This is exactly what I was looking for. Unfortunately when I try to execute the script it makes it through three blog entries before returning the following error.

File "sphpblog.py", line 66, in parse
dict[parts[idx]] = parts[idx+1]
IndexError: list index out of range

Hopefully someone here speaks Python, because I don't, and I have 7 years of blog entries to convert. Any ideas?

Form is loading...

Simplephp blog converter available

1 mypalmike Apr 21, 2008 10:30

2 mochababy Apr 21, 2008 19:09

3 mypalmike Apr 21, 2008 20:25

4 r_kleineisel Sep 23, 2008 16:54

5 amaxson Jul 09, 2010 06:47

About b2evolution

Downloads

About us

Webhosting Guide

Docs & Support

Other

Stay in touch