This repository has been archived on 2024-03-20. You can view files and clone it, but you cannot make any changes to its state, such as pushing and creating new issues, pull requests or comments.
bboschecker/tableExtractor.class.php
2024-03-20 09:29:18 -05:00

383 lines
No EOL
9.9 KiB
PHP

<?php
/*----------------------------------------------------------------------
Table Extractor
===============
Table extractor is a php class that can extract almost any table
from any html document/page, and then convert that html table into
a php array.
Version 1.3
Compatibility: PHP 4.4.1 +
Copyright Jack Sleight - www.reallyshiny.com
This script is licensed under the Creative Commons License.
----------------------------------------------------------------------*/
/**
 * Extracts one HTML table from a document and converts it into a PHP array.
 *
 * Usage: set the public properties below (at least $source), then call
 * extractTable(). The result is a 1-based array of rows; each row is an
 * array keyed by column name (when a header row is used) or column number.
 */
class tableExtractor {
    // HTML document that is searched for the table.
    var $source = NULL;
    // Text used to locate the table. If it is empty or not present in
    // $source, the first table of the document is used.
    var $anchor = NULL;
    // false: the table is the first one opening at/after the anchor.
    // true:  the table is the last one opening before the end of the anchor.
    var $anchorWithin = false;
    // Raw row number that holds the column names (true means row 1);
    // false when the table has no header row.
    var $headerRow = true;
    // First data row to return, counted from 1 after the header row and
    // the dropped rows have been removed.
    var $startRow = 1;
    // Maximum number of rows to return; 0 means no limit.
    var $maxRows = 0;
    // First column to return, counted from 1.
    var $startCol = 1;
    // Maximum number of columns to return; 0 means no limit.
    var $maxCols = 0;
    // When true, strip_tags() is applied to every returned cell.
    var $stripTags = false;
    // List of array('column' => n, 'regex' => pattern, 'names' => name|names):
    // capture groups of the regex applied to column n become extra row entries.
    var $extraCols = array();
    // Number of rows found by prepareArray(), excluding the header row.
    var $rowCount = 0;
    // Raw row numbers to discard: comma separated string (or an array).
    var $dropRows = NULL;
    // Normalised table markup produced by cleanHTML().
    var $cleanHTML = NULL;
    // Row/column grid produced by prepareArray() (both indexes start at 1).
    var $rawArray = NULL;
    // Last result returned by createArray().
    var $finalArray = NULL;
    // Scratch counter used by createArray() while copying capture groups.
    // Declared explicitly so that no dynamic property is created.
    var $extraColsCount = 0;

    /*--------------------------------------------------
    Runs the three processing steps in order and returns
    the final array (empty when no table was found).
    --------------------------------------------------*/
    function extractTable() {
        $this->cleanHTML();
        $this->prepareArray();
        return $this->createArray();
    }

    /*--------------------------------------------------
    Locates the table in $source and stores a normalised
    copy of its markup in $this->cleanHTML:
    table/tr/td tags lowercased, thead/tbody tags removed,
    th tags turned into td tags, CRLF pairs removed.
    Stores an empty string when no table can be found.
    --------------------------------------------------*/
    function cleanHTML() {
        // php 4 compatibility functions
        if(!function_exists('stripos')) {
            function stripos($haystack, $needle, $offset = 0) {
                return(strpos(strtolower($haystack), strtolower($needle), $offset));
            }
        }
        if(!function_exists('lcase_tags')) {
            function lcase_tags($input) {
                return strtolower($input[0]);
            }
        }

        $source = (string) $this->source;
        $anchor = (string) $this->anchor;

        // An empty or missing anchor means "search from the top of the document".
        $anchorPos = ($anchor === '') ? false : stripos($source, $anchor);
        $startSearch = 0;
        if($anchorPos !== false) {
            $startSearch = $anchorPos;
            if($this->anchorWithin) {
                /*------------------------------------------------------------
                With thanks to Khary Sharp for suggesting and writing
                the anchor within functionality.
                ------------------------------------------------------------*/
                // Reverse everything up to the end of the anchor so that the
                // first reversed "<table" is the last table opened before it.
                $sourceSnippet = strrev(substr($source, 0, $anchorPos + strlen($anchor)));
                $reversedPos = stripos($sourceSnippet, strrev('<table'));
                // If no table opens before the anchor, keep searching forward
                // from the anchor instead of computing a bogus offset.
                if($reversedPos !== false)
                    $startSearch = strlen($sourceSnippet) - ($reversedPos + 6);
            }
        }

        // extract table
        $startTable = stripos($source, '<table', $startSearch);
        if($startTable === false) {
            // nothing to extract
            $this->cleanHTML = '';
            return;
        }
        $endTable = stripos($source, '</table>', $startTable);
        if($endTable === false)
            // unclosed table: take everything up to the end of the document
            $table = substr($source, $startTable);
        else
            $table = substr($source, $startTable, $endTable + 8 - $startTable);

        // lowercase all table related tags
        $table = preg_replace_callback('/<(\/?)(table|tr|th|td)/is', 'lcase_tags', $table);
        // remove all thead and tbody tags
        $table = preg_replace('/<\/?(thead|tbody).*?>/is', '', $table);
        // replace th tags with td tags
        $table = preg_replace('/<(\/?)th(.*?)>/is', '<$1td$2>', $table);
        // clean string
        $table = trim($table);
        $table = str_replace("\r\n", "", $table);
        $this->cleanHTML = $table;
    }

    /*--------------------------------------------------
    Splits $this->cleanHTML into rows and cells and stores
    the resulting grid in $this->rawArray. Cells carrying
    colspan/rowspan are copied into the slots they cover.
    Also sets $this->rowCount.
    --------------------------------------------------*/
    function prepareArray() {
        // split table into individual elements (tags are kept as items)
        $pattern = '/(<\/?(?:tr|td).*?>)/is';
        $table = preg_split($pattern, (string) $this->cleanHTML, -1, PREG_SPLIT_DELIM_CAPTURE);
        // define array for new table
        $tableCleaned = array();
        // define variables for looping through table
        $rowCount = 0;
        $colCount = 1;
        $trOpen = false;
        $tdOpen = false;
        $tdTag = '';
        // loop through table
        foreach($table as $item) {
            // trim item
            $item = str_replace('&nbsp;', '', $item);
            $item = trim($item);
            // save the item
            $itemUnedited = $item;
            // reduce a tag to its bare form so it can be matched below
            $item = preg_replace('/<(\/?)(table|tr|td).*?>/is', '<$1$2>', $item);
            // pick item type
            switch ($item) {
                case '<tr>':
                    // start a new row
                    $rowCount++;
                    $colCount = 1;
                    $trOpen = true;
                    break;
                case '<td>':
                    // save the full td tag: its attributes are read once the
                    // cell content arrives
                    $tdTag = $itemUnedited;
                    $tdOpen = true;
                    break;
                case '</td>':
                    $tdOpen = false;
                    break;
                case '</tr>':
                    $trOpen = false;
                    break;
                default :
                    // anything else is cell content, but only inside an open td
                    if(!$tdOpen)
                        break;
                    // colspan: at least 1 (a value of 0 would otherwise drop the
                    // cell), at most 1000 as allowed by the HTML standard
                    $colspan = 1;
                    if(preg_match('/<td [^>]*colspan\s*=\s*(?:\'|")?\s*([0-9]+)[^>]*>/is', $tdTag, $matches))
                        $colspan = min(1000, max(1, (int) $matches[1]));
                    // rowspan: 0 when absent, at most 65534 as allowed by the
                    // HTML standard
                    $rowspan = 0;
                    if(preg_match('/<td [^>]*rowspan\s*=\s*(?:\'|")?\s*([0-9]+)[^>]*>/is', $tdTag, $matches))
                        $rowspan = min(65534, (int) $matches[1]);
                    // loop over the colspans
                    for($c = 0; $c < $colspan; $c++) {
                        // if the slot was already filled by a rowspan from an
                        // earlier row, use the next slot instead
                        if(!isset($tableCleaned[$rowCount][$colCount]))
                            $tableCleaned[$rowCount][$colCount] = $item;
                        else
                            $tableCleaned[$rowCount][$colCount + 1] = $item;
                        // copy the cell into the rows covered by the rowspan
                        $futureRows = $rowCount;
                        for($r = 1; $r < $rowspan; $r++) {
                            $futureRows++;
                            if($colspan > 1)
                                $tableCleaned[$futureRows][$colCount + 1] = $item;
                            else
                                $tableCleaned[$futureRows][$colCount] = $item;
                        }
                        // increase column count
                        $colCount++;
                    }
                    // sort the row array by the column keys (as inserting rowspans screws up the order)
                    ksort($tableCleaned[$rowCount]);
                    break;
            }
        }
        // set row count (never negative, even for an empty table)
        if($this->headerRow)
            $this->rowCount = max(0, count($tableCleaned) - 1);
        else
            $this->rowCount = count($tableCleaned);
        $this->rawArray = $tableCleaned;
    }

    /*--------------------------------------------------
    Builds the final array from $this->rawArray: removes
    the header row and the dropped rows (from rawArray
    itself), applies the row/column windows, adds the
    extra columns and names the cells. The result is
    stored in $this->finalArray and returned.
    --------------------------------------------------*/
    function createArray() {
        // define array to store table data
        $tableData = array();
        if(!is_array($this->rawArray))
            $this->rawArray = array();

        // get column headers
        $columnNames = array();
        if($this->headerRow) {
            // true is used as an array key below, where it means row 1
            $headerKey = ($this->headerRow === true) ? 1 : $this->headerRow;
            if(isset($this->rawArray[$headerKey])) {
                $uniqueNames = array();
                // loop over column names
                $colCount = 0;
                foreach($this->rawArray[$headerKey] as $cell) {
                    $colCount++;
                    $cell = trim(strip_tags($cell));
                    // save name if there is one, otherwise save index
                    if($cell) {
                        if(isset($uniqueNames[$cell])) {
                            // repeated name: append " (2)", " (3)", ...
                            $uniqueNames[$cell]++;
                            $cell .= ' ('.($uniqueNames[$cell] + 1).')';
                        }
                        else {
                            $uniqueNames[$cell] = 0;
                        }
                        $columnNames[$colCount] = $cell;
                    }
                    else
                        $columnNames[$colCount] = $colCount;
                }
                // remove the headers row from the table
                unset($this->rawArray[$headerKey]);
            }
        }

        // remove rows to drop (row numbers refer to the raw table)
        if($this->dropRows !== NULL && $this->dropRows !== '') {
            if(is_array($this->dropRows))
                $dropList = $this->dropRows;
            else
                $dropList = explode(',', (string) $this->dropRows);
            foreach($dropList as $value) {
                // tolerate spaces around the numbers, e.g. "2, 3"
                $value = trim((string) $value);
                if($value !== '')
                    unset($this->rawArray[$value]);
            }
        }

        // set the end row
        if($this->maxRows)
            $endRow = $this->startRow + $this->maxRows - 1;
        else
            $endRow = count($this->rawArray);

        // loop over row array
        $rowCount = 0;
        $newRowCount = 0;
        foreach($this->rawArray as $row) {
            $rowCount++;
            // skip rows outside the requested window
            if($rowCount < $this->startRow || $rowCount > $endRow)
                continue;
            $newRowCount++;
            // create new array to store data
            $tableData[$newRowCount] = array();
            // set the end column
            if($this->maxCols)
                $endCol = $this->startCol + $this->maxCols - 1;
            else
                $endCol = count($row);
            // loop over cell array
            $colCount = 0;
            $newColCount = 0;
            foreach($row as $cell) {
                $colCount++;
                // skip columns outside the requested window
                if($colCount < $this->startCol || $colCount > $endCol)
                    continue;
                $newColCount++;
                // extra columns are taken from the unstripped cell and are
                // added before the cell itself
                if($this->extraCols) {
                    foreach($this->extraCols as $extraColumn) {
                        if($extraColumn['column'] != $colCount)
                            continue;
                        $matched = preg_match($extraColumn['regex'], $cell, $matches);
                        $this->extraColsCount = 0;
                        if(is_array($extraColumn['names'])) {
                            // the n-th name receives the n-th capture group
                            foreach($extraColumn['names'] as $extraColumnSub) {
                                $this->extraColsCount++;
                                $group = $this->extraColsCount;
                                if($matched && isset($matches[$group]))
                                    $tableData[$newRowCount][$extraColumnSub] = $matches[$group];
                                else
                                    $tableData[$newRowCount][$extraColumnSub] = '';
                            }
                        } else {
                            // a single name receives the first capture group
                            if($matched && isset($matches[1]))
                                $tableData[$newRowCount][$extraColumn['names']] = $matches[1];
                            else
                                $tableData[$newRowCount][$extraColumn['names']] = '';
                        }
                    }
                }
                if($this->stripTags)
                    $cell = strip_tags($cell);
                // set the column key as the column number
                $colKey = $newColCount;
                // if there is a table header, use the column name as the key
                if($this->headerRow && isset($columnNames[$colCount]))
                    $colKey = $columnNames[$colCount];
                // add the data to the array
                $tableData[$newRowCount][$colKey] = $cell;
            }
        }
        $this->finalArray = $tableData;
        return $tableData;
    }
}
?>