# wezm.net/importer/wordpress.rb
#
# Originally 152 lines, 4.1 KiB, Ruby.

require 'date'

require 'nokogiri'
require 'nanoc3'
module Importer
  # Reads a WordPress XML export and walks its items (posts, pages and
  # attachments) so they can be turned into items of a nanoc3 site.
  #
  # Categories and tags declared at channel level are indexed when the
  # importer is constructed; #run then dispatches every <item> to the
  # matching process_* method.
  class Wordpress
    # Layout of the wp:post_date_gmt value as parsed by #parse_post_date.
    POST_DATE_FORMAT = "%Y-%m-%d %H:%M:%S".freeze

    # @param wordpress_export_path [String] path of the WordPress export file
    # @param nanoc_site_path [String] path handed to Nanoc3::Site.new
    def initialize(wordpress_export_path, nanoc_site_path)
      # Block form so the file handle is closed as soon as the document has
      # been parsed (the original left it open for the object's lifetime).
      @export = File.open(wordpress_export_path) { |file| Nokogiri::XML(file) }

      @site = Nanoc3::Site.new(nanoc_site_path)

      load_categories
      load_tags
    end

    # Builds @categories: category name => { :slug, :name, :parent }.
    # :parent is the parent category's name, or nil when the category has
    # no parent (element missing or empty).
    def load_categories
      puts "Loading categories"
      @categories = {}
      @export.xpath('//rss/channel/wp:category').each do |category|
        name = get(category, 'wp:cat_name')
        parent = get(category, 'wp:category_parent')
        # get returns nil when the element is absent, so check nil before empty?
        parent = nil if parent.nil? || parent.empty?
        @categories[name] = {
          :slug => get(category, 'wp:category_nicename'),
          :name => name,
          :parent => parent
        }
      end
    end

    # Builds @tags: tag slug => { :slug, :name }.
    def load_tags
      puts "Loading tags"
      @tags = {}
      @export.xpath('//rss/channel/wp:tag').each do |tag|
        slug = get(tag, 'wp:tag_slug')
        @tags[slug] = {
          :slug => slug,
          :name => get(tag, 'wp:tag_name'),
        }
      end
    end

    # Follows :parent links through @categories up to the root category.
    #
    # @param category [Hash, nil] an entry of @categories
    # @return [Hash, nil] the highest ancestor that can be resolved; the
    #   category itself when it has no (known) parent; nil when given nil
    def find_topmost_category(category)
      visited = {}
      while category && category[:parent]
        # A cyclic parent chain would otherwise never terminate.
        break if visited[category[:name]]
        visited[category[:name]] = true

        parent = @categories[category[:parent]]
        # Parent is named but was never declared: stop at the last known one.
        break if parent.nil?
        category = parent
      end
      category
    end

    # Dispatches every <item> of the export on its wp:post_type.
    # Items with a missing or unrecognised type are reported and skipped.
    def run
      @export.xpath('//rss/channel/item').each do |item|
        item_type = get(item, 'wp:post_type')
        case item_type
        when 'post'
          process_post(item)
        when 'page'
          process_page(item)
        when 'attachment'
          process_attachment(item)
        else
          puts "Unknown post type: #{item_type}"
        end
      end
    end

    protected

    # @param node [#at_xpath] node to search from
    # @param xpath [String] XPath expression relative to node
    # @return [String, nil] content of the first match, nil when nothing matches
    def get(node, xpath)
      elem = node.at_xpath(xpath)
      elem ? elem.content : nil
    end

    # Collects a post's attributes and derives its site path
    # "/<section>/<year>/<month>/<slug>/", where section is the slug of the
    # post's topmost category (omitted when the post has no known category).
    #
    # TODO: item creation is still disabled. Once enabled, pass
    # get(post, 'content:encoded') as the content and the returned path as
    # the identifier to add_item.
    #
    # @param post [#at_xpath, #css] an <item> node of type 'post'
    # @return [String] the derived path
    def process_post(post)
      title = get(post, 'title')
      puts "Processing post: #{title}"

      # Prefer the nicename attribute; fall back to the lower-cased tag text.
      tags = post.css('category[domain=tag]').map do |tag|
        tag['nicename'] || tag.text.downcase
      end
      categories = post.css('category[domain=category]').map { |category| category.text }.uniq

      post_date = parse_post_date(get(post, 'wp:post_date_gmt'))
      post_id = get(post, 'wp:post_id').to_i

      # A post without categories yields @categories[nil], i.e. nil.
      top_category = find_topmost_category(@categories[categories.first])
      if top_category.nil?
        puts "WARNING: post #{post_id} has no known category, leaving section empty"
      end

      attributes = {
        :tags => tags.uniq,
        :categories => categories,
        :permalink => get(post, 'link'),
        :status => get(post, 'wp:status'),
        :slug => get(post, 'wp:post_name'),
        :post_id => post_id,
        :post_date => get(post, 'wp:post_date_gmt'),
        :section => top_category ? top_category[:slug] : nil,
        :title => title,
      }

      if attributes[:slug].nil? || attributes[:slug].empty?
        puts "WARNING: Error post #{attributes[:post_id]} has no slug, generating one"
        attributes[:slug] = generate_slug(attributes[:title])
      end

      # compact drops a nil section so the path never contains "//".
      path = [
        '',
        attributes[:section],
        post_date.year,
        ("%02d" % post_date.month),
        attributes[:slug],
        ''
      ].compact.join('/')

      # add_item(content, attributes, path)
      path
    end

    # Pages are only reported for now.
    #
    # @param page [#at_xpath] an <item> node of type 'page'
    def process_page(page)
      puts "Processing page: #{get(page, 'title')}"
    end

    # Attachments are only reported for now.
    #
    # @param attachment [#at_xpath] an <item> node of type 'attachment'
    # @return [String, nil] content of the attachment's guid element
    def process_attachment(attachment)
      puts "Processing attachment"
      get(attachment, 'guid')
    end

    # Creates an item through the site's first data source.
    #
    # @param content [String] item body
    # @param attributes [Hash] item metadata
    # @param identifier [String] item identifier
    def add_item(content, attributes, identifier)
      @site.data_sources.first.create_item(content, attributes, identifier)
      puts "Added item at #{identifier}"
    end

    private

    # Parses a wp:post_date_gmt value, falling back to today when the value
    # is missing (nil raises TypeError) or is not a valid date (ArgumentError).
    def parse_post_date(raw)
      Date.strptime(raw, POST_DATE_FORMAT)
    rescue ArgumentError, TypeError
      Date.today
    end

    # Derives a slug from a title: lower-case, every run of characters other
    # than 0-9/a-z becomes one dash, and leading/trailing dashes are trimmed.
    def generate_slug(title)
      title.to_s.downcase.gsub(/[^0-9a-z]+/, '-').gsub(/\A-+|-+\z/, '')
    end
  end
end