Personal tools

Lhparser.php

From OrganicDesign Wiki

Jump to: navigation, search
<?php
# Convert the PDF named by the "pdf" query-string parameter to text with xpdf's
# pdftotext (-layout mode) and read the resulting text file into $file, one
# array element per line.
#
# The text file is written next to the input: a trailing "pdf" in the supplied
# name is replaced by "txt". No separate temporary file is created.
$root=$_SERVER['DOCUMENT_ROOT'];

# $_GET is used instead of $HTTP_GET_VARS, which current PHP versions no longer provide
$pdf=isset($_GET['pdf'])?$_GET['pdf']:'';
if (!is_string($pdf)||$pdf==='') die("Couldn't read or parse pdf file - supply pdf=filename in query-string");

# \z anchors at the very end of the string, as the POSIX "$" of ereg_replace did
$tmp=preg_replace('/pdf\z/','txt',$pdf);

# Both file names originate from the query-string, so they are quoted for the
# shell; unquoted they would allow arbitrary commands to be appended.
# NOTE(review): the name is still an arbitrary path chosen by the client - consider
# restricting it to a known directory.
@system("$root\\..\\xpdf\\pdftotext -layout ".escapeshellarg($pdf)." ".escapeshellarg($tmp));

# An unreadable or empty text file both count as failure
$file=@file($tmp);
if (!$file) die("Couldn't read or parse pdf file - supply pdf=filename in query-string");
 
# Output table headings:
# 	if "html" is present in the query-string, output is rendered as a table in an HTML doc,
# 	otherwise output is a tab-separated list.
# $html is set here and read again when rows and the footer are rendered.
# $_GET is used instead of $HTTP_GET_VARS, which current PHP versions no longer provide.
$html=array_key_exists('html',$_GET);
if ($html) {?>
<html>
<style><!--td{font-family:arial,sans-serif;font-size:11px;font-weight:bold}//--></style>
<table cellspacing=0 cellpadding=2 border=1 borderwidth=1 bordercolor=black>
<tr bgcolor=#9999cc><td>No.<td width=200>Name<td>Position&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<td width=120>Characteristic<td>Height<td>Range<td width=200>Structure<td>Remarks<td>Type<td>Chart<td>Area</tr><?php }
else print "No.\tName\tPosition\tCharacteristic\tHeight\tRange\tStructure\tRemarks\tType\tChart\tArea\n";
 
	# Extraction issues:
	# 	1. Assuming at least one empty line between entries
	#	2. Column spacing is different on each page
	#	3. Entries can have more than one number (same entry with multiple keys)
	#	4. Content can start on line before number (due to layout inaccuracy in xpdf parser)
	#	5. Position column header does not match content well, but position content is consistent
 
# _________________________________________________________________________________________________________________________________
#
# PARSE#1 - BUILD LAYOUT ARRAY
#
# Reads every line of $file once and fills $layout[page]['cols'] with the
# character offsets that separate the columns on each page:
# 	'1-2', '2-3'            from the most frequent offset at which position content
# 	                        ("NN NN NN N" or "NN NN NN W") starts on that page
# 	'3-4', '4-5', '5-6'     from the words "Height" and "Range" in the page's header line
# 	'6-7'                   midway between the "7" and "8" of the column-number line
# A page is counted when the "(1) (2) ... (8)" line is directly followed by the
# "No. Name and Location ..." header line; $pages ends up as the number of pages seen.

# Initialise page loop environment
$enlf="\n"; # chr used to separate multiple entry numbers
$separator="\t";
$last_line='';
$last_kind='';
$kind=''; # defined up front so the first iteration never reads an unset variable
$layout=array();
$pages=0;
$positions_found=array();

# Loop through lines, record column layout by page
foreach ($file as $line) {

	# Determine action based on line content
	if (preg_match("/^ +\\(1\\) +\\(2\\) +\\(3\\) +\\(4\\) +\\(5\\) +\\(6\\) +\\(7\\) +\\(8\\)/",$line)) $kind='page1';
	elseif (($last_kind=='page1')&&preg_match("/^ +No\\. +Name and Location +Position +Characteristic +Height +Range +Structure +Remarks/",$line)) {

		# Get most frequent location of Position content for cols 1-2 and 2-3 from page just done
		# (on a tie the offset that was seen first wins)
		$max=0; $p=0; foreach ($positions_found as $k=>$v) if ($v>$max) {$max=$v; $p=$k;}
		$layout[$pages]['cols']['1-2']=$p;
		$layout[$pages]['cols']['2-3']=$p+12;

		$pages++;

		# Last four boundaries work well off header text position at start of this new page
		$layout[$pages]['cols']['3-4']=strpos($line,"Height")-1;
		$layout[$pages]['cols']['4-5']=strpos($line,"Range");
		$layout[$pages]['cols']['5-6']=strpos($line,"Range")+6;
		# $last_line is the "(1) ... (8)" line that introduced this header
		$layout[$pages]['cols']['6-7']=(strpos($last_line,"7")+strpos($last_line,"8"))/2;

		# Prepare for next page (this was a no-op comparison "==" before)
		$kind='page2';
		$positions_found=array();
		}
	elseif (preg_match("/^(.+)[0-9][0-9] [0-9][0-9] [0-9][0-9] [NW]/",$line,$m)) {
		# Position data found in this entry, record its location.
		# The length of the text before the match is the offset where the position starts;
		# the counter is created on first use instead of incrementing an undefined index.
		# $kind is deliberately left as it was for this kind of line.
		$offset=strlen($m[1]);
		if (!isset($positions_found[$offset])) $positions_found[$offset]=0;
		$positions_found[$offset]++;
		}
	else $kind='general';

	# Prepare for next line iteration
	$last_line=$line;
	$last_kind=$kind;
	}

# The boundaries for cols 1-2 and 2-3 are only written when the following page's header
# is met, so the final page has to be completed here, otherwise it is left without them.
if ($pages>0) {
	$max=0; $p=0; foreach ($positions_found as $k=>$v) if ($v>$max) {$max=$v; $p=$k;}
	$layout[$pages]['cols']['1-2']=$p;
	$layout[$pages]['cols']['2-3']=$p+12;
	}
 
# _________________________________________________________________________________________________________________________________
#
# PARSE#2 - CONTENT EXTRACTION LOOP
#
# Reads every line of $file a second time. Each line is classified (entry, empty,
# page, chart, area or general); words of entry/general lines are appended to the
# column of the current row that their centre falls in, using the boundaries in
# $layout for the current page. An empty line ends the current row: rows whose type
# is 'Light' are printed (HTML table row or tab-separated line), then the row is reset.
# NOTE(review): rows are only flushed by an empty line, so a row still being built
# when the input ends is not printed.
#
# The POSIX ereg(), ereg_replace() and split() calls were replaced by preg_* / str_replace()
# / explode() equivalents because current PHP versions no longer provide them.

# Initialise line-loop environment
$row=array('','','','','','','','');
$page=0;
$entries=array(''=>'dont render null rows');
$extracted=0;
$current_entry='';
$kind=''; # reset so that the first $last_kind does not inherit a value from the layout pass
$last_kind='';
$chart=0;
$area='';

# Column boundaries from right to left, mapped to the column right of each boundary
$boundaries=array('6-7'=>7,'5-6'=>6,'4-5'=>5,'3-4'=>4,'2-3'=>3,'1-2'=>2);

# Loop through lines, extract & render content
foreach ($file as $line) {

	# Determine the kind of line from content
	$line=rtrim($line);

	$last_kind=$kind;
	if (preg_match("/^([0-9]+) +/",$line,$entry_match)) $kind='entry';	# Numbered entry
	elseif (str_replace(" ","",$line)=="") $kind='empty';				# Empty
	elseif (preg_match("/^ +\\(1\\) +\\(2\\) +\\(3\\) +\\(4\\) +\\(5\\) +\\(6\\) +\\(7\\) +\\(8\\)/",$line)) {
		# New page
		$page++;
		$kind='page';
		}
	elseif (preg_match("/\\(Chart ([0-9]+)\\)/",$line,$m)&&($current_entry=='')) {
		# Chart (only recognised while no entry is being built)
		$kind='chart';
		$chart=$m[1];
		}
	elseif (($last_kind=='empty')&&(!preg_match("/SECTION/",$line)&&!preg_match("/^[ 0-9]+$/",$line)&&preg_match("/^ {30,80}([^ ].+)$/",$line,$m))) {
		# Area (type 1): a line indented by 30-80 spaces directly after an empty line,
		# whose text contains no run of two or more spaces
		if (!preg_match("/ {2,}/",$m[1])) {
			$kind='area';
			$area=$m[1];
			}
		# NOTE(review): when the text does contain a wider gap, $kind is not assigned and so
		# keeps the previous line's value, which is 'empty' here - the line's content is
		# dropped. Confirm this is intended rather than treating the line as 'general'.
		}
	else $kind='general';


	# _________________________________________________________________________________________________________________________________
	#
	# ACCUMULATE COLUMN CONTENT FOR CURRENT MULTILINE ROW

	# If line has an entry number, append to row[0] ($current_entry is the primary entry num only)
	if ($kind=='entry') {
		if ($current_entry) $row[0].=$enlf.$entry_match[1];
		else $row[0]=$current_entry=$entry_match[1];
		}

	# If line is not empty, but also not a page header, then extract content into current row
	if (($kind=='general')||($kind=='entry')) {
		# Remove entry num if any because already dealt with separately
		# (kept in its own variable so the text-file name in $tmp is not overwritten)
		$content=preg_replace("/^[0-9]+/","",$line);
		# Loop through content a word at a time (incl. spaces before each word for position info)
		$current_position=strlen($line)-strlen($content); # Account for removed entry num
		preg_match_all("/( +[^ ]+)/",$content,$words);
		# Boundaries of the current page; looked up once per line rather than once per word
		$cols=isset($layout[$page]['cols'])?$layout[$page]['cols']:array();
		$last_col=-1;
		foreach ($words[1] as $sw) {
			# Get length of space and remove from word
			preg_match("/^( +)([^ ]+)$/",$sw,$m);
			$space=strlen($m[1]);
			$word=$m[2];
			# For each word assign to a column by which column word center falls within.
			# A boundary that was never recorded compares as null, so every word counts as
			# lying to the right of it - the same outcome as reading the undefined index.
			$pos=$current_position+$space+1+strlen($word)/2;
			$col=1;
			foreach ($boundaries as $key=>$right_col) {
				$limit=isset($cols[$key])?$cols[$key]:null;
				if ($pos>$limit) {$col=$right_col; break;}
				}
			# Append the word and its space to its assigned column
			$current_position+=strlen($sw);
			# A word that switches to a column which already has content, and does not start
			# with a lower-case letter, begins a new line within that column
			if (($last_col!=$col)&&($row[$col])&&preg_match("/^ +[^ a-z]/",$sw)) $sw=$enlf.trim($sw);
			else $sw=" ".trim($sw);
			$row[$col].=$sw;
			$last_col=$col;
			}
		}

	# _________________________________________________________________________________________________________________________________
	#
	# RENDER AND CLEAR CURRENT ROW

	# If current line is empty, process current row content
	if ($kind=='empty') {

		# This primary-entry-number hasn't been marked as done, so render it now
		if (!array_key_exists($current_entry,$entries)) {
			# Post-process row (to get type from name col)
			# NOTE(review): 'Bouy' is spelled as found; such rows are never printed below
			if (preg_match("/buoy/i",$row[1])) $row[8]='Bouy';
			elseif (preg_match("/light([^e]|$)/i",$row[1])) $row[8]='Light';
			else $row[8]='';
			$row[9]=$chart;
			$row[10]=$area;
			# Render (only rows of type 'Light' are output)
			if ($row[8]=='Light') {
				if ($html) {
					print "<tr valign=top>";
					foreach ($row as $col) {
						if ($col=='') print '<td>&nbsp;</td>';
						else print '<td>'.str_replace("\n","<br>",htmlentities($col)).'</td>';
						}
					print "</tr>\n";
					}
				# Tab-separated output: newlines inside a cell are written as a literal \n
				else print str_replace("\n","\\n",join($separator,$row))."\n";
				}
			# Counts every new entry, whether or not it was printed
			$extracted++;
			}

		# Clear row and prepare for new info
		foreach (explode($enlf,$row[0]) as $i) $entries[$i]=1; # Mark all numbers of this entry as done
		$current_entry='';
		$row=array('','','','','','','','');
		}
	}
 
# _________________________________________________________________________________________________________________________________
#

# Clean up and exit: in HTML mode append a summary row with the page and entry
# counts, then close the table and the document. Tab-separated mode prints nothing here.
//unlink($tmp);
if ($html) {
	print "<tr><td colspan=11 align=center><font color=red>$pages pages found, $extracted entries extracted.</font></td></tr>";
	print "</table>\n</html>\n";
	}
?>

The GNU Project Debian Linux Ubuntu Linux Wikipedia online encyclopedia MediaWiki