--- author: email: mail@petermolnar.net image: https://petermolnar.net/favicon.jpg name: Peter Molnar url: https://petermolnar.net copies: - http://web.archive.org/web/20160314105324/http://petermolnar.eu/linux-tech-coding/wordpress-cleanup-markdown-import-twitter-import-facebook lang: en published: '2014-07-18T22:50:26+00:00' summary: 'THA Big Bad Cleanup of my WordPress: from importing tweets & statuses to posting in Markdown.' tags: - WordPress title: 'Trimming the fat: THA Big Bad Cleanup of my WordPress' --- About two *(or more)* months ago I decided to pull myself together from the shards[^1]: - import ~~all~~ my important Tweets *( thank God altogether there are fewer than 800 )* - import important Facebook updates & posted photos - dig up all the saved versions of all my previous sites and get most of the posts from there - remove unnecessary plugins - clean up leftover shortcodes, old markup, etc. from old posts ## Importing ### Twitter Someone thought about exporting tweets in a usable manner: you'll receive a CSV version of all your tweets when you go to Settings[^2] and click **Request your archive**. All you need is WP Ultimate CSV Importer[^3] and you can even import the retweet and reply metadata from the CSV file; although you might need a bit of magic to replace the links with the expanded ones instead of the short links. ### Facebook On the other hand, Facebook is terrible. The export you receive is an HTML file, the entries aren't in a block but separated with `
]*? # Match anything other than '>', Zero or More times, not-greedy (wont eat the /) (\/?) # Capture Group $2 - '/' if it is there > # Match '>' /i # End Pattern - Case Insensitive * */ //$content = preg_replace("/<([a-z][a-z0-9]*)[^>]*?(\/?)>/i",'<$1$2>', $content); /** * replace
, , [code] and [cc]
     */

    if ( strstr( $content, '
' )) {
        $s = array ( '
', '
' ); $r = array ( "```\n", "\n```" ); $content = str_replace ( $s, $r, $content ); } if ( strstr( $content, '
' )) {
        $s = array ( '
', '
' ); $r = array ( "```\n", "\n```" ); $content = str_replace ( $s, $r, $content ); } if ( strstr( $content, '
' )) { $s = array ( '', '' ); $r = array ( "```\n", "\n```" ); $content = str_replace ( $s, $r, $content ); } // straigtforward formatting: html to markdown $s = array ( '', '', '', '', '', '', '', '', '', '' ); $r = array ( '`', '`', '**', '**', '**', '**', '*', '*', '*', '*' ); $content = str_replace ( $s, $r, $content ); $s = array ( '

','

', '
', '
', '

', '

', '

', '

','

', '

','

', '

','
', '
','
', '
', '
', '
' ); $r = array ( "\n", "\n", "\n", "\n", '#', '', '## ', '', '### ', '', '#### ', '', '##### ', '', '###### ', '', '> ', '' ); $content = str_replace ( $s, $r, $content ); preg_match_all('/
    (.*?)< \/ul>/s', $content, $uls); if ( !empty ( $uls[0] ) ) { foreach ( $uls[0] as $to_replace ) { $to_clean = preg_replace ( '/\t
  • /', '- ', $to_replace ); $s = array ( '
  • ', '
    ', '
', '
  • ' ); $r = array ( '', '', '', '- ' ); $to_clean = str_replace ( $s, $r, $to_clean ); $content = str_replace ( $to_replace, $to_clean, $content ); } } preg_match_all('/
      (.*?)< \/ol>/s', $content, $ols); if ( !empty ( $ols[0] ) ) { foreach ( $ols[0] as $to_replace ) { $to_clean = $to_replace; preg_match_all('/
    1. (.*?)< \/li>/s', $to_clean, $lis); foreach ( $lis[0] as $id=>$lis_replace ) { $liline = $lis_replace; $lis_replace = preg_replace ( '/\t
    2. /', $id+1 . '. ', $lis_replace ); $lis_replace = preg_replace ( '/
    3. /', $id+1 . '. ', $lis_replace ); $to_clean = str_replace ( $liline , $lis_replace, $to_clean ); } $content = str_replace ( $to_replace, $to_clean, $content ); } } $s = array ( '
        ', '
      ', '
    4. ' ); $r = array ( '', '', '' ); $content = str_replace ( $s, $r, $content ); preg_match_all('/
      (.*?)< \/dl>/s', $content, $dl); if ( !empty ( $dl[0] ) ) { foreach ( $dl[0] as $to_replace ) { $to_clean = $to_replace; preg_match_all('/
      (.*?)< \/dt>/s', $to_clean, $dts); preg_match_all('/
      (.*?)< \/dd>/s', $to_clean, $dds); foreach ( $dts[0] as $id=>$dt ) { $o_dt = $dt; $o_dd = $dds[0][$id]; $dt = str_replace ( array('
      ', '
      ' ), array( "" , "\n" ), $dt ); } } } $c = str_get_html ( $content ); if (!$c) return $content; // find links foreach($c->find('a') as $a) { $out = $href = $title = $txt = ''; $href = $a->href; $title = $a->title; $txt = $a->innertext; if ( !empty( $txt ) && !empty ( $href ) ) { if (!empty($title)) $out = '['. $txt .' '.$title.']('. $href .')'; else $out = '['. $txt .']('. $href .')'; $content = str_replace ( $a->outertext, $out, $content ); } } // clean up images: foreach($c->find('img') as $img) { $src = $alt = $title = $cl = $out = false; $src = $img->src; $alt = $img->alt; $title = $img->title; if ( empty($alt) && !empty($title) ) $alt = $title; if ( empty($alt) ) $alt = $src; $img = '!['.$alt.']('. $src; if ( !empty($title) ) $img .= ' '. $title; $img .= ')'; $content = str_replace ( $img->outertext, $img, $content ); } // fix potential hashtag issues $content = preg_replace ( '/^#/mi', '\#', $content ); wp_cache_set ( $hash, $content, __CLASS__ . __FUNCTION__, static::expire ); return $content; } ``` So far, so good. Some plugins are not playing that well with the Markdown-only text I'm now saving, but I'll probably just post patches to them for this. [^1]: [^2]: [^3]: [^4]: [^5]: [^6]: [^7]: [^8]: [^9]: [^10]: [^11]: [^12]: