| 
<?php
 // Should be good enough for names in French bibliographies, although it does not
 // handle stop words (short words such as "of" and "and" are capitalised too).
 // ex. AUSTRALIAN GOVERNMENT DEPARTMENT OF HEALTH AND AGEING => Australian Government Department Of Health And Ageing
 
 /**
  * Convert the (upper-case) text of one span to title case.
  *
  * Rules, applied byte by byte:
  *  - The first ASCII letter of every word is upper-cased, the rest lower-cased.
  *    Any character that is not an ASCII letter ends the current word.
  *  - A word starting with "Mc" keeps the letter after the "c" capitalised
  *    (MCDONALD => McDonald).
  *  - Well-formed character entities are handled as a unit:
  *      * accented-letter / ligature entities (&Eacute;, &ccedil;, &OElig; ...)
  *        count as a letter of the word: capitalised when they start a word,
  *        lower-cased otherwise (entity names are case-sensitive, so the
  *        capital form is "&Eacute;" and the small form "&eacute;");
  *      * numeric entities are lower-cased and leave the word state untouched,
  *        because the character they stand for is not inspected here;
  *      * every other named entity (&nbsp;, &amp;, &rsquo; ...) is lower-cased
  *        and ends the current word, like any other non-letter.
  *  - A "&" that does not start a well-formed entity is an ordinary separator.
  *
  * Limitation: the scan is byte-wise, so raw multi-byte (e.g. UTF-8) accented
  * characters are treated as non-letters and therefore end the current word.
  *
  * @param string $span_content Inner HTML of the span.
  * @return string The title-cased content.
  */
 function title_case_span_content($span_content)
 {
     $result = '';
     $length = strlen($span_content);
     // One of: 'outside' (not in a word), 'word', 'after_m' (word so far is "M"),
     // 'after_mc' (word so far is "Mc").
     $state = 'outside';
     $offset = 0;

     while ($offset < $length) {
         $char = $span_content[$offset];

         // Only treat "&" as an entity start when a complete entity follows;
         // otherwise a bare ampersand would swallow the rest of the text.
         if ($char === '&'
             && preg_match('/\G&(#[0-9]+|#x[0-9a-f]+|[a-z][a-z0-9]*);/i', $span_content, $entity, 0, $offset) === 1
         ) {
             $name = strtolower($entity[1]);
             $offset += strlen($entity[0]);

             if ($name[0] === '#') {
                 // Numeric reference: unknown character, keep the current state.
                 $result .= '&' . $name . ';';
                 continue;
             }

             $is_letter_entity = preg_match(
                 '/^(?:[a-z](?:acute|grave|circ|uml|tilde|cedil|ring|slash|caron)|(?:ae|oe)lig)$/',
                 $name
             ) === 1;

             if ($is_letter_entity) {
                 if ($state === 'outside' || $state === 'after_mc') {
                     // Capital form: "Eacute", but "AElig" / "OElig" for ligatures.
                     if (substr($name, -3) === 'lig') {
                         $name = strtoupper(substr($name, 0, 2)) . 'lig';
                     } else {
                         $name = ucfirst($name);
                     }
                 }
                 $state = 'word';
             } else {
                 $state = 'outside';
             }

             $result .= '&' . $name . ';';
             continue;
         }

         $is_letter = preg_match('/[A-Za-z]/', $char) === 1;

         switch ($state) {
             case 'outside':
                 if ($is_letter) {
                     $state = ($char === 'M' || $char === 'm') ? 'after_m' : 'word';
                     $result .= strtoupper($char);
                 } else {
                     $result .= strtolower($char);
                 }
                 break;

             case 'after_m':
                 if ($char === 'C' || $char === 'c') {
                     $state = 'after_mc';
                 } elseif ($is_letter) {
                     $state = 'word';
                 } else {
                     $state = 'outside';
                 }
                 $result .= strtolower($char);
                 break;

             case 'after_mc':
                 // Letter right after "Mc" stays capitalised; a non-letter ends the word.
                 $state = $is_letter ? 'word' : 'outside';
                 $result .= strtoupper($char);
                 break;

             default: // 'word'
                 if (!$is_letter) {
                     $state = 'outside';
                 }
                 $result .= strtolower($char);
                 break;
         }

         $offset++;
     }

     return $result;
 }

 // --- Script: rewrite every upper-case span of the target file in place. ---

 $file = 'C:\wamp\www\sweeper\not-swept\im-014-fr.html';
 $span_pattern = '/<span style="text-transform:uppercase">(.*?)<\/span>/is';

 // Never fall through to file_put_contents() with a failed read: that would
 // overwrite the document with an empty string.
 $contents = is_readable($file) ? file_get_contents($file) : false;

 if ($contents === false) {
     print('Unable to read ' . $file);
 } elseif (preg_match_all($span_pattern, $contents, $upperclass_span_matches, PREG_OFFSET_CAPTURE) === false) {
     print('Unable to scan ' . $file . ' for upper-case spans');
 } else {
     print('<table>');

     // Walk the matches from last to first so that replacing one span does not
     // shift the recorded byte offsets of the spans still to be processed.
     for ($counter = count($upperclass_span_matches[0]) - 1; $counter >= 0; $counter--) {
         $whole_span = $upperclass_span_matches[0][$counter][0];
         $span_offset = $upperclass_span_matches[0][$counter][1];
         $span_content = $upperclass_span_matches[1][$counter][0];

         $new_span_content = title_case_span_content($span_content);

         // Before/after row so the conversion can be reviewed in the browser.
         print("<tr>\n<th align=\"left\">" . $span_content . "</th>\n<td>" . $new_span_content . "</td>\n</tr>\n");

         $contents = substr_replace(
             $contents,
             '<span style="text-transform:uppercase">' . $new_span_content . '</span>',
             $span_offset,
             strlen($whole_span)
         );
     }

     print('</table>');

     if (file_put_contents($file, $contents) === false) {
         print('Unable to write ' . $file);
     }
 }
 
 ?>
 |