Review notes (text before the first "diff --git" header is not part of any hunk):
  * attachment.rb: "@occurrance_count++" is not a Ruby increment; the two added
    lines now read "@occurrance_count += 1".
  * attachment.rb: the added "byebug if post.id == 168" breakpoint is dropped; the
    affected hunk header is now +126,25 and the following hunk starts at +153.
  * attachment.rb: "rescue Exception" in to_rails is narrowed to StandardError so
    Ctrl-C / exit are not swallowed during a long import.
  * The attachment.rb "index" line still carries the original post-image hash.
  * Blank context lines and indentation were re-derived from the hunk line counts;
    verify against the target tree before applying.

diff --git a/lib/tasks/wordpress.rake b/lib/tasks/wordpress.rake
index e7908a5..f544c72 100644
--- a/lib/tasks/wordpress.rake
+++ b/lib/tasks/wordpress.rake
@@ -5,8 +5,7 @@ namespace :wordpress do
   task :reset_blog do
     Rake::Task["environment"].invoke
 
-    %w(taggings tags blog_comments blog_categories blog_categories_blog_posts
-       blog_posts).each do |table_name|
+    %w(posts post_translations taggings tags).each do |table_name|
       p "Truncating #{table_name} ..."
       ActiveRecord::Base.connection.execute "DELETE FROM #{table_name}"
     end
@@ -16,18 +15,23 @@ namespace :wordpress do
   desc "import blog data from a WordPressImport XML dump"
   task :import_blog, :file_name do |task, params|
     Rake::Task["environment"].invoke
+    p "Loading XML from #{params[:file_name]} ..."
     dump = WordPressImport::Dump.new(params[:file_name])
 
-    dump.authors.each(&:to_refinery)
+    p "Importing #{dump.authors.count} authors ..."
+    dump.authors.each(&:to_rails)
 
+    # by default, import all; unless $ONLY_PUBLISHED = "true"
     only_published = ENV['ONLY_PUBLISHED'] == 'true' ? true : false
-    dump.posts(only_published).each(&:to_refinery)
+    p "Importing #{dump.posts(only_published).count} posts ..."
+
+    if only_published
+      p "(only published posts)"
+    else
+      p "(export ONLY_PUBLISHED=true to import only published posts)"
+    end
 
-    WordPressImport::Post.create_blog_page_if_necessary
-
-    ENV["MODEL"] = 'BlogPost'
-    Rake::Task["friendly_id:redo_slugs"].invoke
-    ENV.delete("MODEL")
+    dump.posts(only_published).each(&:to_rails)
   end
 
   desc "reset blog tables and then import blog data from a WordPressImport XML dump"
@@ -38,53 +42,53 @@ namespace :wordpress do
   end
 
 
-  desc "Reset the cms relevant tables for a clean import"
-  task :reset_pages do
-    Rake::Task["environment"].invoke
+  # desc "Reset the cms relevant tables for a clean import"
+  # task :reset_pages do
+  #   Rake::Task["environment"].invoke
 
-    %w(page_part_translations page_translations page_parts pages).each do |table_name|
-      p "Truncating #{table_name} ..."
-      ActiveRecord::Base.connection.execute "DELETE FROM #{table_name}"
-    end
-  end
+  #   %w(page_part_translations page_translations page_parts pages).each do |table_name|
+  #     p "Truncating #{table_name} ..."
+  #     ActiveRecord::Base.connection.execute "DELETE FROM #{table_name}"
+  #   end
+  # end
 
-  desc "import cms data from a WordPress XML dump"
-  task :import_pages, :file_name do |task, params|
-    Rake::Task["environment"].invoke
-    dump = WordPressImport::Dump.new(params[:file_name])
+  # desc "import cms data from a WordPress XML dump"
+  # task :import_pages, :file_name do |task, params|
+  #   Rake::Task["environment"].invoke
+  #   dump = WordPressImport::Dump.new(params[:file_name])
 
-    only_published = ENV['ONLY_PUBLISHED'] == 'true' ? true : false
-    dump.pages(only_published).each(&:to_refinery)
+  #   only_published = ENV['ONLY_PUBLISHED'] == 'true' ? true : false
+  #   dump.pages(only_published).each(&:to_rails)
 
-    # After all pages are persisted we can now create the parent - child
-    # relationships. This is necessary, as WordPress doesn't dump the pages in
-    # a correct order.
-    dump.pages(only_published).each do |dump_page|
-      page = ::Page.find(dump_page.post_id)
-      page.parent_id = dump_page.parent_id
-      page.save!
-    end
+  #   # After all pages are persisted we can now create the parent - child
+  #   # relationships. This is necessary, as WordPress doesn't dump the pages in
+  #   # a correct order.
+  #   dump.pages(only_published).each do |dump_page|
+  #     page = ::Page.find(dump_page.post_id)
+  #     page.parent_id = dump_page.parent_id
+  #     page.save!
+  #   end
 
-    WordPressImport::Post.create_blog_page_if_necessary
+  #   WordPressImport::Post.create_blog_page_if_necessary
 
-    ENV["MODEL"] = 'Page'
-    Rake::Task["friendly_id:redo_slugs"].invoke
-    ENV.delete("MODEL")
-  end
+  #   ENV["MODEL"] = 'Page'
+  #   Rake::Task["friendly_id:redo_slugs"].invoke
+  #   ENV.delete("MODEL")
+  # end
 
-  desc "reset cms tables and then import cms data from a WordPress XML dump"
-  task :reset_and_import_pages, :file_name do |task, params|
-    Rake::Task["environment"].invoke
-    Rake::Task["wordpress:reset_pages"].invoke
-    Rake::Task["wordpress:import_pages"].invoke(params[:file_name])
-  end
+  # desc "reset cms tables and then import cms data from a WordPress XML dump"
+  # task :reset_and_import_pages, :file_name do |task, params|
+  #   Rake::Task["environment"].invoke
+  #   Rake::Task["wordpress:reset_pages"].invoke
+  #   Rake::Task["wordpress:import_pages"].invoke(params[:file_name])
+  # end
 
 
   desc "Reset the media relevant tables for a clean import"
   task :reset_media do
     Rake::Task["environment"].invoke
 
-    %w(images resources).each do |table_name|
+    %w(rich_rich_files).each do |table_name|
       p "Truncating #{table_name} ..."
       ActiveRecord::Base.connection.execute "DELETE FROM #{table_name}"
     end
@@ -95,13 +99,14 @@ namespace :wordpress do
     Rake::Task["environment"].invoke
     dump = WordPressImport::Dump.new(params[:file_name])
 
-    attachments = dump.attachments.each(&:to_refinery)
+    p "Importing #{dump.attachments.each_slice(25).first.count} attachments ..."
+    attachments = dump.attachments.each_slice(25).first.each(&:to_rails)
+    p "Errors were encountered: #{$ATTACHMENT_EXCEPTIONS.inspect}" unless $ATTACHMENT_EXCEPTIONS.blank?
 
-    # parse all created BlogPost and Page bodys and replace the old wordpress media uls
+    # parse all created Post and Page bodys and replace the old wordpress media urls
     # with the newly created ones
-    attachments.each do |attachment|
-      attachment.replace_url
-    end
+    p "Replacing attachment URLs found in posts/pages ..."
+    attachments.each(&:replace_url)
   end
 
   desc "reset media tables and then import media data from a WordPress XML dump"
@@ -115,7 +120,16 @@ namespace :wordpress do
   task :full_import, :file_name do |task, params|
     Rake::Task["environment"].invoke
     Rake::Task["wordpress:reset_and_import_blog"].invoke(params[:file_name])
-    Rake::Task["wordpress:reset_and_import_pages"].invoke(params[:file_name])
-    Rake::Task["wordpress:reset_import_and_replace_media"].invoke(params[:file_name])
+    #Rake::Task["wordpress:reset_and_import_pages"].invoke(params[:file_name])
+    #Rake::Task["wordpress:reset_import_and_replace_media"].invoke(params[:file_name])
+    Rake::Task["wordpress:import_and_replace_media"].invoke(params[:file_name])
+  end
+
+
+  desc "Local First master import (no resets)"
+  task :lfa_import, :file_name do |task, params|
+    Rake::Task["environment"].invoke
+    Rake::Task["wordpress:import_blog"].invoke(params[:file_name])
+    Rake::Task["wordpress:import_and_replace_media"].invoke(params[:file_name])
   end
 end
diff --git a/lib/wordpress/attachment.rb b/lib/wordpress/attachment.rb
index 9ee68d0..dbead03 100644
--- a/lib/wordpress/attachment.rb
+++ b/lib/wordpress/attachment.rb
@@ -1,8 +1,8 @@
 module WordPressImport
   class Attachment
     attr_reader :node
-    attr_reader :refinery_image
-    attr_reader :refinery_resource
+    attr_reader :paperclip_image
+    attr_reader :paperclip_file
 
     def initialize(node)
       @node = node
@@ -40,42 +40,79 @@ module WordPressImport
       url.match /\.(png|jpg|jpeg|gif)$/
     end
 
-    def to_refinery
-      if image?
-        to_image
-      else
-        to_resource
+    def to_rails
+      begin
+        if image?
+          to_image
+        else
+          to_file
+        end
+      rescue StandardError => ex
+        message = "ERROR saving attachment #{url} -- #{ex.message}"
+        p message
+        $ATTACHMENT_EXCEPTIONS = [] if $ATTACHMENT_EXCEPTIONS.blank?
+        $ATTACHMENT_EXCEPTIONS << message
       end
     end
 
     def replace_url
+      @occurrance_count = 0
       if image?
         replace_image_url
       else
         replace_resource_url
       end
+      p "Replaced #{@occurrance_count} occurrances of #{url}"
     end
 
     private
 
-    def to_image
-      image = ::Image.new
-      image.created_at = post_date
-      image.image_url = url
-      image.save!
+    def rich_file_clean_file_name(full_file_name)
+      extension = File.extname(full_file_name).gsub(/^\.+/, '')
+      filename = full_file_name.gsub(/\.#{extension}$/, '')
+
+      filename = CGI::unescape(filename)
+      filename = CGI::unescape(filename)
+
+      extension = extension.downcase
+      filename = filename.downcase.gsub(/[^a-z0-9]+/i, '-')
+
+      "#{filename}.#{extension}"
+    end
 
-      @refinery_image = image
+    def to_image
+      # avoid duplicates; use our storage system's filename cleaner for lookup
+      image = ::Rich::RichFile.find_or_initialize_by(rich_file_file_name: rich_file_clean_file_name(file_name))
+
+      if image.rich_file.instance.id.blank?
+        p "Importing image #{file_name}"
+        image.simplified_type = "image"
+        image.created_at = post_date
+        image.rich_file = URI.parse(url)
+        image.save!
+      else
+        p "image #{file_name} already exists..."
+      end
+
+      @paperclip_image = image
       image
     end
 
-    def to_resource
-      resource = ::Resource.new
-      resource.created_at = post_date
-      resource.file_url = url
-      resource.save!
+    def to_file
+      # avoid duplicates; use our storage system's filename cleaner for lookup
+      file = ::Rich::RichFile.find_or_initialize_by(rich_file_file_name: rich_file_clean_file_name(file_name))
 
-      @refinery_resource = resource
-      resource
+      if file.rich_file.instance.id.blank?
+        p "Importing file #{file_name}"
+        file.created_at = post_date
+        file.rich_file = URI.parse(url) if file.rich_file.blank?
+        file.save!
+      else
+        p "file #{file_name} already exists..."
+      end
+
+      @paperclip_file = file
+      file
     end
 
     def replace_image_url
@@ -89,24 +126,25 @@ module WordPressImport
     end
 
     def replace_image_url_in_blog_posts
-      replace_url_in_blog_posts(refinery_image.image.url)
+      replace_url_in_blog_posts(paperclip_image.rich_file.url)
     end
 
     def replace_image_url_in_pages
-      replace_url_in_pages(refinery_image.image.url)
+      replace_url_in_pages(paperclip_image.rich_file.url)
     end
 
     def replace_resource_url_in_blog_posts
-      replace_url_in_blog_posts(refinery_resource.file.url)
+      replace_url_in_blog_posts(paperclip_file.rich_file.url)
     end
 
     def replace_resource_url_in_pages
-      replace_url_in_pages(refinery_resource.file.url)
+      replace_url_in_pages(paperclip_file.rich_file.url)
     end
 
     def replace_url_in_blog_posts(new_url)
-      ::BlogPost.all.each do |post|
-        if (! post.body.empty?) && post.body.include?(url)
+      ::Post.all.each do |post|
+        if ((! post.body.empty?) && post.body.include?(url))
+          @occurrance_count += 1
           post.body = post.body.gsub(url_pattern, new_url)
           post.save!
         end
@@ -115,10 +153,13 @@ module WordPressImport
 
     def replace_url_in_pages(new_url)
       ::Page.all.each do |page|
-        page.parts.each do |part|
-          if (! part.body.to_s.blank?) && part.body.include?(url)
-            part.body = part.body.gsub(url_pattern, new_url)
-            part.save!
+        page.translations.each do |translation|
+          translation.parts.each do |part|
+            if (! part.content.to_s.blank?) && part.content.include?(url)
+              @occurrance_count += 1
+              part.content = part.content.gsub(url_pattern, new_url)
+              part.save!
+            end
           end
         end
       end
diff --git a/lib/wordpress/dump.rb b/lib/wordpress/dump.rb
index c5ceb04..0aa7347 100644
--- a/lib/wordpress/dump.rb
+++ b/lib/wordpress/dump.rb
@@ -3,15 +3,17 @@ module WordPressImport
     attr_reader :doc
 
     def initialize(file_name)
-      file_name = File.expand_path(file_name)
-
-      raise "Given file '#{file_name}' no file or not readable." \
-        unless File.file?(file_name) && File.readable?(file_name)
+      begin
+        file_name = File.expand_path(file_name)
+        raise "error" unless File.file?(file_name) && File.readable?(file_name)
+      rescue
+        raise "Given file '#{file_name}' is not a file or not readable. Rake tasks take filename arguments like this: rake wordpress:full_import['/path/to/my_file']"
+      end
 
       file = File.open(file_name)
 
       if file.size >= 10485760 # 10MB
-        puts "WARNING: LibXML by default supports 10MB max file size. On some systems your file will be silently truncated; on others, an error will be raised. Consider splitting your file into smaller chunks, or double-checking the import results."
+        puts "WARNING: LibXML by default supports 10MB max file size. On some systems your file will be silently truncated; on others, an error will be raised. Consider splitting your file into smaller chunks and running rake tasks individually (authors, then blog/pages, then media), and double-check the import results."
       end
 
       @doc = Nokogiri::XML(file)