1 zerohalo Mar 25, 2005 19:21
3 zerohalo Mar 25, 2005 23:34
Thanks for the info. You're right, it's not the answer I wanted to hear :-). But it's good to know what's doable and not doable.
Do I understand correctly that the Renderer plugins take the text from the post and manipulate it before it's posted? If that's the case, then someone could write a PHP script that searches and replaces those troublesome characters?
(Not to put b2evo down, but WordPress doesn't have this same problem, and it's also written in PHP. Actually, it's very similar to b2evo in many respects. Perhaps we could use some code from there to fix this? If I were a coder I could probably figure it out myself, but alas, I am not.)
4 zerohalo Mar 25, 2005 23:44
Here's some code I found in WordPress, which, if I'm looking at it right, "sanitizes" the post to avoid the kind of error generated by b2evo with invalid characters.
Is there some way to use/adapt some of these functions in b2evo?
/**
 * Replace plain ASCII punctuation in $text with typographic characters
 * (dashes, ellipsis, curly quotes, primes, (tm)/(c)/(r) symbols, and the
 * multiplication sign in "2x3"), leaving HTML tags themselves untouched.
 *
 * The input is split into alternating text/tag chunks. The text chunk that
 * immediately follows an opening <code>, <pre>, <kbd>, <style> or <script>
 * tag is passed through unchanged; only that one chunk is protected, because
 * every later chunk that is not such an opening tag re-enables conversion.
 *
 * @param string $text Text, possibly containing HTML tags.
 * @return string The text with typographic replacements applied.
 */
function wptexturize($text) {
    $output = '';
    // Capture tags and everything inside them
    $textarr = preg_split("/(<.*>)/Us", $text, -1, PREG_SPLIT_DELIM_CAPTURE);
    $stop = count($textarr);
    $next = true; // whether the next text chunk may be converted
    // Opening tags whose following text chunk must be left alone
    $protected_tags = array('<code', '<pre', '<kbd', '<style', '<script');

    // This is a hack, look at this more later. It works pretty well though.
    $cockney = array("'tain't","'twere","'twas","'tis","'twill","'til","'bout","'nuff","'round");
    $cockneyreplace = array("’tain’t","’twere","’twas","’tis","’twill","’til","’bout","’nuff","’round");

    for ($i = 0; $i < $stop; $i++) {
        $curl = $textarr[$i];
        // Square-bracket offsets: the {0} brace form no longer parses on PHP 8.
        if (isset($curl[0]) && '<' != $curl[0] && $next) { // If it's not a tag
            // Order matters: longer sequences first ('---' before '--').
            $curl = str_replace('---', '—', $curl);
            $curl = str_replace('--', '–', $curl);
            $curl = str_replace("...", '…', $curl);
            $curl = str_replace('``', '“', $curl);

            $curl = str_replace($cockney, $cockneyreplace, $curl);

            $curl = preg_replace("/'s/", '’s', $curl);
            $curl = preg_replace("/'(\d\d(?:’|')?s)/", "’$1", $curl);
            $curl = preg_replace('/(\s|\A|")\'/', '$1‘', $curl);
            $curl = preg_replace('/(\d+)"/', '$1″', $curl);
            $curl = preg_replace("/(\d+)'/", '$1′', $curl);
            $curl = preg_replace("/(\S)'([^'\s])/", "$1’$2", $curl);
            // Only one capture group exists here, so only $1 is referenced.
            $curl = preg_replace('/(\s|\A)"(?!\s)/', '$1“', $curl);
            $curl = preg_replace('/"(\s|\Z)/', '”$1', $curl);
            $curl = preg_replace("/'([\s.]|\Z)/", '’$1', $curl);
            $curl = preg_replace("/\(tm\)/i", '™', $curl);
            $curl = preg_replace("/\(c\)/i", '©', $curl);
            $curl = preg_replace("/\(r\)/i", '®', $curl);
            $curl = str_replace("''", '”', $curl);
            // Digits on BOTH sides of the x (the first \d previously lacked its backslash).
            $curl = preg_replace('/(\d+)x(\d+)/', '$1×$2', $curl);
        } else {
            // Tag, empty chunk, or a protected text chunk: decide whether the
            // chunk that follows may be converted.
            $next = true;
            foreach ($protected_tags as $tag) {
                if (strpos($curl, $tag) !== false) {
                    $next = false;
                    break;
                }
            }
        }
        $output .= $curl;
    }
    return $output;
}
/**
 * Undo slash-escaping in $text and drop every literal '<br />' from it.
 *
 * @param string $text Contents of a <pre> element.
 * @return string The unescaped text without '<br />' tags.
 */
function clean_pre($text) {
    return str_replace('<br />', '', stripslashes($text));
}
/**
 * Turn blank-line separated text into <p> paragraphs and, optionally,
 * remaining single newlines into <br /> tags, while keeping block-level
 * elements out of the generated paragraphs.
 *
 * @param string $pee Text to format.
 * @param mixed  $br  When truthy (default 1), newlines left after paragraph
 *                    creation get a '<br />' in front of them.
 * @return string The formatted markup.
 */
function wpautop($pee, $br = 1) {
    // Block-level tag names, shared by every pattern below.
    $blocks = '(?:table|thead|tbody|tr|td|th|div|dl|dd|dt|ul|ol|li|pre|select|form|blockquote|math|p|h[1-6])';

    $pee = $pee . "\n"; // just to make things a little easier, pad the end
    $pee = preg_replace('|<br />\s*<br />|', "\n\n", $pee);
    // Space things out a little
    $pee = preg_replace('!(<' . $blocks . '[^>]*>)!', "\n$1", $pee);
    $pee = preg_replace('!(</' . $blocks . '>)!', "$1\n", $pee);
    $pee = str_replace(array("\r\n", "\r"), "\n", $pee); // cross-platform newlines
    $pee = preg_replace("/\n\n+/", "\n\n", $pee); // take care of duplicates
    $pee = preg_replace('/\n?(.+?)(?:\n\s*\n|\z)/s', "\t<p>$1</p>\n", $pee); // make paragraphs, including one at the end
    $pee = preg_replace('|<p>\s*?</p>|', '', $pee); // under certain strange conditions it could create a P of entirely whitespace
    $pee = preg_replace('!<p>\s*(</?' . $blocks . '[^>]*>)\s*</p>!', "$1", $pee); // don't wrap a lone block tag in a paragraph
    $pee = preg_replace("|<p>(<li.+?)</p>|", "$1", $pee); // problem with nested lists
    $pee = preg_replace('|<p><blockquote([^>]*)>|i', "<blockquote$1><p>", $pee);
    $pee = str_replace('</blockquote></p>', '</p></blockquote>', $pee);
    $pee = preg_replace('!<p>\s*(</?' . $blocks . '[^>]*>)!', "$1", $pee);
    $pee = preg_replace('!(</?' . $blocks . '[^>]*>)\s*</p>!', "$1", $pee);
    if ($br) {
        $pee = preg_replace('|(?<!<br />)\s*\n|', "<br />\n", $pee); // optionally make line breaks
    }
    $pee = preg_replace('!(</?' . $blocks . '[^>]*>)\s*<br />!', "$1", $pee);
    $pee = preg_replace('!<br />(\s*</?(?:p|li|div|dl|dd|dt|th|pre|td|ul|ol)>)!', '$1', $pee);
    // Strip the <br /> tags added inside <pre> elements. This used the /e
    // (eval) modifier, which PHP 7 removed (preg_replace then returns NULL);
    // a callback receives the raw captures, so no slash-unescaping is needed
    // and backslashes inside <pre> content are preserved as written.
    $pee = preg_replace_callback(
        '!(<pre.*?>)(.*?)</pre>!is',
        function ($m) {
            return $m[1] . str_replace('<br />', '', $m[2]) . '</pre>';
        },
        $pee
    );
    return $pee;
}
# Report whether every byte of $Str fits the UTF-8 lead/continuation byte
# layout (1- to 6-byte forms). Returns true for an empty string.
# Original algorithm by bmorel at ssi dot fr.
function seems_utf8($Str) {
    $length = strlen($Str);
    $pos = 0;
    while ($pos < $length) {
        $lead = ord($Str[$pos]);
        if ($lead < 0x80) {                    # 0bbbbbbb: plain ASCII byte
            $pos++;
            continue;
        }
        # Number of 10bbbbbb continuation bytes this lead byte announces.
        if (($lead & 0xE0) == 0xC0) {          # 110bbbbb
            $trailing = 1;
        } elseif (($lead & 0xF0) == 0xE0) {    # 1110bbbb
            $trailing = 2;
        } elseif (($lead & 0xF8) == 0xF0) {    # 11110bbb
            $trailing = 3;
        } elseif (($lead & 0xFC) == 0xF8) {    # 111110bb
            $trailing = 4;
        } elseif (($lead & 0xFE) == 0xFC) {    # 1111110b
            $trailing = 5;
        } else {
            return false;                      # matches no lead-byte model
        }
        while ($trailing > 0) {
            $pos++;
            # Truncated sequence, or the byte is not a continuation byte.
            if ($pos == $length || (ord($Str[$pos]) & 0xC0) != 0x80) {
                return false;
            }
            $trailing--;
        }
        $pos++;
    }
    return true;
}
/**
 * Replace accented single-byte characters in $string with unaccented ASCII
 * letters. Input that seems_utf8() accepts is first converted with
 * utf8_decode(), after a handful of multi-byte sequences have been mapped
 * to ASCII directly.
 *
 * @param string $string Text to transliterate.
 * @return string The text with accented characters replaced.
 */
function remove_accents($string) {
    // Byte values of the accented characters, position-matched to $plain.
    $accented_codes = array(
        128, 131, 138, 142, 154, 158,
        159, 162, 165, 181, 192, 193, 194,
        195, 196, 197, 199, 200, 201, 202,
        203, 204, 205, 206, 207, 209, 210,
        211, 212, 213, 214, 216, 217, 218,
        219, 220, 221, 224, 225, 226, 227,
        228, 229, 231, 232, 233, 234, 235,
        236, 237, 238, 239, 241, 242, 243,
        244, 245, 246, 248, 249, 250, 251,
        252, 253, 255,
    );
    $accented = implode('', array_map('chr', $accented_codes));
    $plain = "EfSZszYcYuAAAAAACEEEEIIIINOOOOOOUUUUYaaaaaaceeeeiiiinoooooouuuuyy";

    if (seems_utf8($string)) {
        // Multi-byte sequences replaced with ASCII before utf8_decode() runs.
        $multibyte_map = array(
            "\xC5\x92"     => 'OE',
            "\xC5\x93"     => 'oe',
            "\xC5\xA0"     => 'S',
            "\xC5\xBD"     => 'Z',
            "\xC5\xA1"     => 's',
            "\xC5\xBE"     => 'z',
            "\xE2\x82\xAC" => 'E',
        );
        $string = utf8_decode(strtr($string, $multibyte_map));
    }

    // One byte in, one letter out.
    $string = strtr($string, $accented, $plain);

    // Single bytes that expand to two letters.
    $two_letter_map = array(
        "\x8C" => 'OE',
        "\x9C" => 'oe',
        "\xC6" => 'AE',
        "\xD0" => 'DH',
        "\xDE" => 'TH',
        "\xDF" => 'ss',
        "\xE6" => 'ae',
        "\xF0" => 'dh',
        "\xFE" => 'th',
    );
    return str_replace(array_keys($two_letter_map), array_values($two_letter_map), $string);
}
/**
 * Pass $title through the 'sanitize_title' hook via do_action() and return
 * whatever that call yields.
 *
 * @param string $title Raw title.
 * @return mixed The value returned by do_action().
 */
function sanitize_title($title) {
    return do_action('sanitize_title', $title);
}
/**
 * Build a lowercase, dash-separated slug from $title: accents removed,
 * entities and every character outside a-z, 0-9, space, '_' and '-' dropped,
 * spaces turned into dashes, and dashes collapsed and trimmed from the ends.
 *
 * @param string $title Raw title.
 * @return string The slug.
 */
function sanitize_title_with_dashes($title) {
    $slug = strtolower(remove_accents($title));

    // Applied in order: kill entities, drop disallowed characters,
    // collapse whitespace runs to a single space.
    $slug = preg_replace(
        array('/&.+?;/', '/[^a-z0-9 _-]/', '/\s+/'),
        array('', '', ' '),
        $slug
    );

    // Spaces become dashes, then runs of dashes shrink to one.
    $slug = preg_replace('|-+|', '-', str_replace(' ', '-', $slug));

    return trim($slug, '-');
}
/**
 * Clean up post content: drop <title>/<category> metadata elements, escape
 * ampersands that do not start an entity, apply the global
 * $wp_htmltranswinuni translation table, and self-close <br>/<hr>.
 *
 * @param string $content Content to convert.
 * @param string $flag    Not used by this function (default 'obsolete').
 * @return string The converted content.
 */
function convert_chars($content, $flag = 'obsolete') {
    global $wp_htmltranswinuni;

    // Remove metadata tags
    $content = preg_replace('/<title>(.+?)<\/title>/', '', $content);
    $content = preg_replace('/<category>(.+?)<\/category>/', '', $content);

    // Convert lone & characters into the numeric entity &#038;. The
    // replacement must contain the entity itself; a bare '&' here would
    // put back exactly what was matched and escape nothing.
    $content = preg_replace('/&([^#])(?![a-z]{1,8};)/i', '&#038;$1', $content);

    // Fix Word pasting. Skipped when the table is not a populated array:
    // two-argument strtr() needs an array and would otherwise fail and wipe
    // out $content.
    if (is_array($wp_htmltranswinuni) && !empty($wp_htmltranswinuni)) {
        $content = strtr($content, $wp_htmltranswinuni);
    }

    // Just a little XHTML help
    $content = str_replace('<br>', '<br />', $content);
    $content = str_replace('<hr>', '<hr />', $content);

    return $content;
}
Those troublesome characters aren't going to be easy to get rid of. The best advice is to let people know not to paste from Word docs. It's probably not the answer you want, but it's the answer.