From Monket
Usage
Run the script with the command-line (CLI) version of PHP.
It reads text from standard input, converts it, and writes the result to standard output.
Code
<?php // -*- coding: utf-8 -*-
/*
Replaces macrons (and other non-ASCII characters) with HTML entities.
Converts non-ASCII characters to their HTML entity equivalents.
This includes Macrons, curly quotes, en- and em-dashes, and copyright symbols.
Also converts Word list markers to '*' characters, so that lists are in
a Markdown format.
*/
// Replacement table applied with strtr() before the remaining non-ASCII
// characters are entity-encoded. Keys are UTF-8 literals (plus the ASCII "..."
// sequence); values are the named HTML entities to emit. The last two entries
// are not entities: they turn Word's bullet-plus-tab list markers into "*\t".
$encodings = array(
    '–' => '&ndash;',
    '—' => '&mdash;',
    '...' => '&hellip;', // three ASCII full stops collapse to a single ellipsis
    '…' => '&hellip;',
    '“' => '&ldquo;',
    '”' => '&rdquo;',
    '‘' => '&lsquo;',
    '’' => '&rsquo;',
    '©' => '&copy;',
    '®' => '&reg;',
    "·\t" => "*\t", // When pasting lists from Word they use this bullet; convert to a star, which Markdown will process as a list
    "§\t" => "*\t", // When pasting lists from Word they use this bullet; convert to a star, which Markdown will process as a list
);
/**
 * Converts one chunk of UTF-8 text to ASCII-only HTML text.
 *
 * @param string $text         UTF-8 input, typically a single line.
 * @param array  $replacements Map of search string => replacement, applied with strtr().
 * @return string The text with the replacements applied and every remaining
 *                character at or above U+0080 written as a decimal numeric
 *                entity (e.g. "&#257;"). ASCII characters, including the "&"
 *                of entities inserted by the replacements, are left untouched.
 */
function encode_non_ascii($text, array $replacements)
{
    $text = strtr($text, $replacements);

    // mb_convert_encoding(..., 'HTML-ENTITIES', ...) is deprecated as of
    // PHP 8.2 and its notice can end up mixed into the converted output, so
    // use the numeric-entity encoder instead.
    // Convmap is (first code point, last code point, offset, mask).
    return mb_encode_numericentity($text, array(0x80, 0x10FFFF, 0, 0x1FFFFF), 'UTF-8');
}

// For each line on standard input, first apply the custom replacements, then
// entity-encode whatever non-ASCII characters remain.
// fgets() is called without a length limit so a long line is never cut in the
// middle of a multi-byte character or of a "..." / bullet+tab sequence, and
// the loop stops on the false that fgets() returns at end of input instead of
// pushing it through the conversion.
while (($line = fgets(STDIN)) !== false) {
    echo encode_non_ascii($line, $encodings);
}
?>