prog/boffice/_class/parsingHtml.class.php

520 lines
18 KiB
PHP

<?php
/**
* HTML2PDF Librairy - parsingHtml class
*
* HTML => PDF convertor
* distributed under the LGPL License
*
* @author Laurent MINGUET <webmaster@html2pdf.fr>
* @version 4.03
*/
class HTML2PDF_parsingHtml
{
protected $_html = ''; // HTML code to parse
protected $_num = 0; // table number
protected $_level = 0; // table level
protected $_encoding = ''; // encoding
public $code = array(); // parsed HTML codfe
const HTML_TAB = ' ';
/**
* main constructor
*
* @param string encoding
* @access public
*/
public function __construct($encoding = 'UTF-8')
{
$this->_num = 0;
$this->_level = array($this->_num);
$this->_html = '';
$this->code = array();
$this->setEncoding($encoding);
}
/**
* change the encoding
*
* @param string encoding
* @access public
*/
public function setEncoding($encoding)
{
$this->_encoding = $encoding;
}
/**
* Define the HTML code to parse
*
* @param string HTML code
* @access public
*/
public function setHTML($html)
{
// remove the HTML in comment
$html = preg_replace('/<!--(.*)-->/isU', '', $html);
// save the HTML code
$this->_html = $html;
}
/**
* parse the HTML code
*
* @access public
*/
public function parse()
{
$parents = array();
// flag : are we in a <pre> Tag ?
$tagPreIn = false;
// action to use for each line of the content of a <pre> Tag
$tagPreBr = array(
'name' => 'br',
'close' => false,
'param' => array(
'style' => array(),
'num' => 0
)
);
// tag that can be not closed
$tagsNotClosed = array(
'br', 'hr', 'img', 'col',
'input', 'link', 'option',
'circle', 'ellipse', 'path', 'rect', 'line', 'polygon', 'polyline'
);
// search the HTML tags
$tmp = array();
$this->_searchCode($tmp);
// all the actions to do
$actions = array();
// foreach part of the HTML code
foreach ($tmp as $part) {
// if it is a tag code
if ($part[0]=='code') {
// analise the HTML code
$res = $this->_analiseCode($part[1]);
// if it is a real HTML tag
if ($res) {
// save the current posistion in the HTML code
$res['html_pos'] = $part[2];
// if the tag must be closed
if (!in_array($res['name'], $tagsNotClosed)) {
// if it is a closure tag
if ($res['close']) {
// HTML validation
if (count($parents)<1)
throw new HTML2PDF_exception(3, $res['name'], $this->getHtmlErrorCode($res['html_pos']));
else if ($parents[count($parents)-1]!=$res['name'])
throw new HTML2PDF_exception(4, $parents, $this->getHtmlErrorCode($res['html_pos']));
else
unset($parents[count($parents)-1]);
} else {
// if it is a autoclosed tag
if ($res['autoclose']) {
// save the opened tag
$actions[] = $res;
// prepare the closed tag
$res['params'] = array();
$res['close'] = true;
}
// else :add a child for validation
else
$parents[count($parents)] = $res['name'];
}
// if it is a <pre> tag (or <code> tag) not auclosed => update the flag
if (($res['name']=='pre' || $res['name']=='code') && !$res['autoclose']) {
$tagPreIn = !$res['close'];
}
}
// save the actions to convert
$actions[] = $res;
} else { // else (it is not a real HTML tag => we transform it in Texte
$part[0]='txt';
}
}
// if it is text
if ($part[0]=='txt') {
// if we are not in a <pre> tag
if (!$tagPreIn) {
// save the action
$actions[] = array(
'name' => 'write',
'close' => false,
'param' => array('txt' => $this->_prepareTxt($part[1])),
);
} else { // else (if we are in a <pre> tag)
// prepare the text
$part[1] = str_replace("\r", '', $part[1]);
$part[1] = explode("\n", $part[1]);
// foreach line of the text
foreach ($part[1] as $k => $txt) {
// transform the line
$txt = str_replace("\t", self::HTML_TAB, $txt);
$txt = str_replace(' ', '&nbsp;', $txt);
// add a break line
if ($k>0) $actions[] = $tagPreBr;
// save the action
$actions[] = array(
'name' => 'write',
'close' => false,
'param' => array('txt' => $this->_prepareTxt($txt, false)),
);
}
}
}
}
// for each indentified action, we have to clean up the begin and the end of the texte
// based on tags that surround it
// list of the tags to clean
$tagsToClean = array(
'page', 'page_header', 'page_footer', 'form',
'table', 'thead', 'tfoot', 'tr', 'td', 'th', 'br',
'div', 'hr', 'p', 'ul', 'ol', 'li',
'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
'bookmark', 'fieldset', 'legend',
'draw', 'circle', 'ellipse', 'path', 'rect', 'line', 'g', 'polygon', 'polyline',
'option'
);
// foreach action
$nb = count($actions);
for ($k=0; $k<$nb; $k++) {
// if it is a Text
if ($actions[$k]['name']=='write') {
// if the tag before the text is a tag to clean => ltrim on the text
if ($k>0 && in_array($actions[$k-1]['name'], $tagsToClean))
$actions[$k]['param']['txt'] = ltrim($actions[$k]['param']['txt']);
// if the tag after the text is a tag to clean => rtrim on the text
if ($k<$nb-1 && in_array($actions[$k+1]['name'], $tagsToClean))
$actions[$k]['param']['txt'] = rtrim($actions[$k]['param']['txt']);
// if the text is empty => remove the action
if (!strlen($actions[$k]['param']['txt']))
unset($actions[$k]);
}
}
// if we are not on the level 0 => HTML validator ERROR
if (count($parents)) throw new HTML2PDF_exception(5, $parents);
// save the actions to do
$this->code = array_values($actions);
}
/**
* prepare the text
*
* @param string texte
* @param boolean true => replace multiple space+\t+\r+\n by a single space
* @return string texte
* @access protected
*/
protected function _prepareTxt($txt, $spaces = true)
{
if ($spaces) $txt = preg_replace('/\s+/is', ' ', $txt);
$txt = str_replace('&euro;', '€', $txt);
$txt = html_entity_decode($txt, ENT_QUOTES, $this->_encoding);
return $txt;
}
/**
* parse the HTML code
*
* @param &array array's result
* @return null
*/
protected function _searchCode(&$tmp)
{
// initialise the array
$tmp = array();
// regexp to separate the tags from the texts
$reg = '/(<[^>]+>)|([^<]+)+/isU';
// last match found
$str = '';
$offset = 0;
// As it finds a match
while (preg_match($reg, $this->_html, $parse, PREG_OFFSET_CAPTURE, $offset)) {
// if it is a tag
if ($parse[1][0]) {
// save the previous text if it exists
if ($str!=='') $tmp[] = array('txt', $str);
// save the tag, with the offset
$tmp[] = array('code', trim($parse[1][0]), $offset);
// init the current text
$str = '';
} else { // else (if it is a text)
// add the new text to the current text
$str.= $parse[2][0];
}
// Update offset to the end of the match
$offset = $parse[0][1] + strlen($parse[0][0]);
unset($parse);
}
// if a text is present in the end, we save it
if ($str!='') $tmp[] = array('txt', $str);
unset($str);
}
/**
* analise a HTML tag
*
* @param string HTML code to analise
* @return array corresponding action
*/
protected function _analiseCode($code)
{
// name of the tag, opening, closure, autoclosure
$tag = '<([\/]{0,1})([_a-z0-9]+)([\/>\s]+)';
if (!preg_match('/'.$tag.'/isU', $code, $match)) return null;
$close = ($match[1]=='/' ? true : false);
$autoclose = preg_match('/\/>$/isU', $code);
$name = strtolower($match[2]);
// required parameters (depends on the tag name)
$param = array();
$param['style'] = '';
if ($name=='img') {
$param['alt'] = '';
$param['src'] = '';
}
if ($name=='a') {
$param['href'] = '';
}
// read the parameters : nom=valeur
$prop = '([a-zA-Z0-9_]+)=([^"\'\s>]+)';
preg_match_all('/'.$prop.'/is', $code, $match);
for($k=0; $k<count($match[0]); $k++)
$param[trim(strtolower($match[1][$k]))] = trim($match[2][$k]);
// read the parameters : nom="valeur"
$prop = '([a-zA-Z0-9_]+)=["]([^"]*)["]';
preg_match_all('/'.$prop.'/is', $code, $match);
for($k=0; $k<count($match[0]); $k++)
$param[trim(strtolower($match[1][$k]))] = trim($match[2][$k]);
// read the parameters : nom='valeur'
$prop = "([a-zA-Z0-9_]+)=[']([^']*)[']";
preg_match_all('/'.$prop.'/is', $code, $match);
for($k=0; $k<count($match[0]); $k++)
$param[trim(strtolower($match[1][$k]))] = trim($match[2][$k]);
// compliance of each parameter
$color = "#000000";
$border = null;
foreach ($param as $key => $val) {
$key = strtolower($key);
switch($key)
{
case 'width':
unset($param[$key]);
$param['style'] .= 'width: '.$val.'px; ';
break;
case 'align':
if ($name==='img') {
unset($param[$key]);
$param['style'] .= 'float: '.$val.'; ';
} elseif ($name!=='table') {
unset($param[$key]);
$param['style'] .= 'text-align: '.$val.'; ';
}
break;
case 'valign':
unset($param[$key]);
$param['style'] .= 'vertical-align: '.$val.'; ';
break;
case 'height':
unset($param[$key]);
$param['style'] .= 'height: '.$val.'px; ';
break;
case 'bgcolor':
unset($param[$key]);
$param['style'] .= 'background: '.$val.'; ';
break;
case 'bordercolor':
unset($param[$key]);
$color = $val;
break;
case 'border':
unset($param[$key]);
if (preg_match('/^[0-9]+$/isU', $val)) $val = $val.'px';
$border = $val;
break;
case 'cellpadding':
case 'cellspacing':
if (preg_match('/^([0-9]+)$/isU', $val)) $param[$key] = $val.'px';
break;
case 'colspan':
case 'rowspan':
$val = preg_replace('/[^0-9]/isU', '', $val);
if (!$val) $val = 1;
$param[$key] = $val;
break;
}
}
// compliance of the border
if ($border!==null) {
if ($border) $border = 'border: solid '.$border.' '.$color;
else $border = 'border: none';
$param['style'] .= $border.'; ';
$param['border'] = $border;
}
// reading styles: decomposition and standardization
$styles = explode(';', $param['style']);
$param['style'] = array();
foreach ($styles as $style) {
$tmp = explode(':', $style);
if (count($tmp)>1) {
$cod = $tmp[0];
unset($tmp[0]);
$tmp = implode(':', $tmp);
$param['style'][trim(strtolower($cod))] = preg_replace('/[\s]+/isU', ' ', trim($tmp));
}
}
// determining the level of table opening, with an added level
if (in_array($name, array('ul', 'ol', 'table')) && !$close) {
$this->_num++;
$this->_level[count($this->_level)] = $this->_num;
}
// get the level of the table containing the element
if (!isset($param['num'])) {
$param['num'] = $this->_level[count($this->_level)-1];
}
// for closures table: remove a level
if (in_array($name, array('ul', 'ol', 'table')) && $close) {
unset($this->_level[count($this->_level)-1]);
}
// prepare the parameters
if (isset($param['value'])) $param['value'] = $this->_prepareTxt($param['value']);
if (isset($param['alt'])) $param['alt'] = $this->_prepareTxt($param['alt']);
if (isset($param['title'])) $param['title'] = $this->_prepareTxt($param['title']);
if (isset($param['class'])) $param['class'] = $this->_prepareTxt($param['class']);
// return the new action to do
return array('name' => $name, 'close' => $close ? 1 : 0, 'autoclose' => $autoclose, 'param' => $param);
}
/**
* get a full level of HTML, between an opening and closing corresponding
*
* @param integer key
* @return array actions
*/
public function getLevel($k)
{
// if the code does not exist => return empty
if (!isset($this->code[$k])) return array();
// the tag to detect
$detect = $this->code[$k]['name'];
// if it is a text => return
if ($detect=='write') {
return array($this->code[$k]);
}
//
$level = 0; // depth level
$end = false; // end of the search
$code = array(); // extract code
// while it's not ended
while (!$end) {
// current action
$row = $this->code[$k];
// if 'write' => we add the text
if ($row['name']=='write') {
$code[] = $row;
} else { // else, it is a html tag
$not = false; // flag for not taking into account the current tag
// if it is the searched tag
if ($row['name']==$detect) {
// if we are just at the root level => dont take it
if ($level==0) {
$not = true;
}
// update the level
$level+= ($row['close'] ? -1 : 1);
// if we are now at the root level => it is the end, and dont take it
if ($level==0) {
$not = true;
$end = true;
}
}
// if we can takin into account the current tag => save it
if (!$not) {
if (isset($row['style']['text-align'])) unset($row['style']['text-align']);
$code[] = $row;
}
}
// it continues as long as there has code to analise
if (isset($this->code[$k+1]))
$k++;
else
$end = true;
}
// return the extract
return $code;
}
/**
* return a part of the HTML code, for error message
*
* @param integer position
* @param integer take before
* @param integer take after
* @return string part of the html code
*/
public function getHtmlErrorCode($pos, $before=30, $after=40)
{
return substr($this->_html, $pos-$before, $before+$after);
}
}