Parser mapa UKE + crawler

wtorek, 25 września 2012

Parser mapa UKE + crawler

Poniżej zamieszczam kod źródłowy aplikacji, którą napisałem do parsowania danych z mapy UKE (mapa.uke.gov.pl).

Struktura bazy MySQL

mysql> explain mapa;
+----------------+--------------+------+-----+---------+-------+
| Field | Type | Null | Key | Default | Extra |
+----------------+--------------+------+-----+---------+-------+
| id | int(20) | NO | PRI | NULL | |
| miejsce | varchar(255) | NO | | NULL | |
| ogolem | int(20) | NO | | NULL | |
| indywidualni | int(20) | NO | | NULL | |
| biznesowi | int(20) | NO | | NULL | |
| t_ogolem | int(20) | NO | | NULL | |
| t_indywidualni | int(20) | NO | | NULL | |
| t_biznesowi | int(20) | NO | | NULL | |
| op_int | int(20) | NO | | NULL | |
| op_tel | int(20) | NO | | NULL | |
+----------------+--------------+------+-----+---------+-------+
10 rows in set (0.00 sec)

Plik func.php

<?php

/**
 * Opens the MySQL connection used by the scraper and selects its database.
 *
 * All connection parameters are hard-coded in the function body.
 *
 * @return resource|false Whatever mysql_connect() returned.
 */
function polacz()
{
    $serwer = 'localhost';
    $uzytkownik = 'root';
    $haslo = 'pass';
    $baza = 'uke';

    $polaczenie = mysql_connect($serwer, $uzytkownik, $haslo);

    // Called without a link argument, so it acts on the most recently
    // opened connection.
    mysql_select_db($baza);

    return $polaczenie;
}

/**
 * Parses the first <table> of an HTML file into an array of rows.
 *
 * When the first <tr> contains <th> cells, their text becomes the keys of
 * every row; otherwise rows are plain numerically indexed lists. Every <tr>
 * contributes exactly one entry built from its <td> cells, so a row without
 * <td> cells (for example the header row itself) yields an empty array.
 * Cells that have no matching header are appended with numeric keys.
 *
 * @param string $tabelka Path of the HTML document, passed to
 *                        DOMDocument::loadHTMLFile().
 * @return array List of rows; empty when the document cannot be loaded or
 *               contains no table.
 */
function tabulka($tabelka)
{
    $dom = new DOMDocument();
    if (!$dom->loadHTMLFile($tabelka)) {
        return array();
    }

    // item(0) is null when the document has no <table>; guard against it
    // instead of calling a method on null.
    $first_table = $dom->getElementsByTagName('table')->item(0);
    if ($first_table === null) {
        return array();
    }

    $rows = $first_table->getElementsByTagName('tr');

    // Column names come from the <th> cells of the first row, if any.
    $row_headers = array();
    if ($rows->length > 0) {
        foreach ($rows->item(0)->getElementsByTagName('th') as $node) {
            $row_headers[] = $node->nodeValue;
        }
    }

    $table = array();
    foreach ($rows as $tr) {
        $row = array();
        $i = 0;
        foreach ($tr->getElementsByTagName('td') as $node) {
            if (isset($row_headers[$i])) {
                $row[$row_headers[$i]] = $node->nodeValue;
            } else {
                $row[] = $node->nodeValue;
            }
            $i++;
        }
        $table[] = $row;
    }

    return $table;
}

/**
 * Returns the full text matched by a pattern, or null when nothing matches.
 *
 * @param string $pattern PCRE pattern.
 * @param string $subject Text to search.
 * @return string|null
 */
function spider_first_match($pattern, $subject)
{
    $m = array();
    if (preg_match($pattern, $subject, $m) === 1) {
        return $m[0];
    }
    return null;
}

/**
 * Downloads one page, extracts its statistics and inserts them into the
 * `mapa` table.
 *
 * The page is split into the "firstText" block (internet) and the
 * "secondText" block (telephone). From each block the function takes the
 * digits of the title cell (operator count) and the third cell of the first
 * three table rows, which go into the ogolem / indywidualni / biznesowi
 * columns (t_* for the telephone block). The table fragments are written to
 * /tmp/html.<id> and /tmp/tel.<id> and parsed with tabulka().
 *
 * The function prints the internet operator count and the executed INSERT
 * statement. When the URL has no "=", the page cannot be downloaded, one of
 * the two blocks is missing, or the database is unreachable, the URL is
 * skipped and nothing is inserted.
 *
 * @param string $id Full page URL; the text after the first "=" is used as
 *                   the record id.
 * @return void
 */
function spider($id)
{
    $id_miejsca = explode("=", $id);
    if (!isset($id_miejsca[1]) || $id_miejsca[1] === '') {
        trigger_error("spider: no id found in '$id'", E_USER_WARNING);
        return;
    }
    $place_id = $id_miejsca[1];

    // file_get_contents() reports its own warning on failure.
    $intro = file_get_contents($id);
    if ($intro === false) {
        return;
    }

    // Internet block and telephone block of the page.
    $internet_div = spider_first_match('#\<div id="firstText"\>(.+?)\<div id="secondText">#s', $intro);
    $phone_div = spider_first_match('#\<div id="secondText"\>(.+?)\<\/body\>#s', $intro);
    if ($internet_div === null || $phone_div === null) {
        trigger_error("spider: unexpected page layout for '$id'", E_USER_WARNING);
        return;
    }

    $title_cell = '#\<td class="titleCell" valign="top"\>(.+?)\<\/td\>#s';
    $data_table = '#\<td class="cell" colspan="3"\>(.+?)\<\/table\>#s';

    // Operator counts: keep only the digits of the title cell.
    $op_int = preg_replace('/[^0-9]/', '', (string) spider_first_match($title_cell, $internet_div));
    echo 'operatorzy'.$op_int;

    echo "\n\n";

    $op_int2 = preg_replace('/[^0-9]/', '', (string) spider_first_match($title_cell, $phone_div));

    // The matched fragments start at a <td>, so prepend the opening tags to
    // make them parseable tables.
    $tab = '<table><tr>' . (string) spider_first_match($data_table, $internet_div);
    $tab2 = '<table><tr>' . (string) spider_first_match($data_table, $phone_div);

    // Write the fragments directly instead of going through a shell, which
    // broke on any single quote in the HTML.
    $file = trim("/tmp/html.$place_id");
    $file2 = trim("/tmp/tel.$place_id");
    if (file_put_contents($file, $tab . "\n") === false
        || file_put_contents($file2, $tab2 . "\n") === false) {
        return;
    }

    $res = tabulka($file);
    $res2 = tabulka($file2);

    // Third cell of rows 0..2 of each table; missing cells become ''.
    $stats = array();
    foreach (array($res, $res2) as $rows) {
        for ($r = 0; $r < 3; $r++) {
            $stats[] = isset($rows[$r][2]) ? $rows[$r][2] : '';
        }
    }
    list($ogol, $ind, $biz, $t_ogol, $t_ind, $t_biz) = $stats;

    // Place name (commune/county): text after the first "-" of the first
    // plain <div>.
    // NOTE(review): the match still includes the surrounding <div> tags, so
    // the extracted name may end with "</div>" - confirm against a real page.
    $place_div = (string) spider_first_match('#\<div\>(.+?)\<\/div\>#s', $intro);
    $miejsce = explode("-", $place_div);
    $miejscowosc = isset($miejsce[1]) ? ltrim($miejsce[1]) : '';
    $miejscowosc = iconv("ISO-8859-2", "UTF-8", $miejscowosc);

    $link = polacz();
    if (!$link) {
        trigger_error('spider: no database connection', E_USER_WARNING);
        return;
    }

    // Scraped text goes into the statement, so escape every value.
    $values = array($place_id, $miejscowosc, $ogol, $ind, $biz, $t_ogol, $t_ind, $t_biz, $op_int, $op_int2);
    $quoted = array();
    foreach ($values as $value) {
        $quoted[] = "'" . mysql_real_escape_string((string) $value, $link) . "'";
    }

    $insert = "INSERT INTO mapa (id,miejsce,ogolem,indywidualni,biznesowi,t_ogolem,t_indywidualni,t_biznesowi,op_int,op_tel) values (" . implode(',', $quoted) . ")";

    if (!mysql_query($insert, $link)) {
        trigger_error('spider: insert failed: ' . mysql_error($link), E_USER_WARNING);
    }
    echo $insert."\n";
}

?>

Plik uke.php

<?php
// Reads urls.txt line by line and passes every non-empty line to spider(),
// sleeping 6 seconds after each one.
require_once 'func.php';

$plik = fopen('urls.txt', 'r');
if ($plik === false) {
    // fopen() has already reported a warning. Stop here: feof() on an
    // invalid handle never returns true, so the loop would not terminate.
    exit(1);
}

// Loop on the result of fgets() rather than feof(), so the final read at
// end-of-file does not trigger a call with an empty URL.
while (($linia = fgets($plik)) !== false) {
    $url = trim($linia);
    if ($url === '') {
        // Blank lines in the list are not URLs.
        continue;
    }

    spider($url);
    sleep(6);
}

fclose($plik);
?>

Plik urls.txt


http://www.mapa.uke.gov.pl/index.php?id=221105
http://www.mapa.uke.gov.pl/index.php?id=221101
http://www.mapa.uke.gov.pl/index.php?id=221103
http://www.mapa.uke.gov.pl/index.php?id=221102
http://www.mapa.uke.gov.pl/index.php?id=221107
http://www.mapa.uke.gov.pl/index.php?id=221104
http://www.mapa.uke.gov.pl/index.php?id=221106

...

Plik ten można przygotować za pomocą crawlera opisanego w tym poście

3 komentarze:

Anonimowy12 lutego 2013 10:26
Ten komentarz został usunięty przez administratora bloga.
OdpowiedzUsuń
Odpowiedzi
Anonimowy12 lutego 2013 12:44
Ten komentarz został usunięty przez administratora bloga.
OdpowiedzUsuń
Odpowiedzi
Anonimowy12 lutego 2013 14:13
Ten komentarz został usunięty przez administratora bloga.
OdpowiedzUsuń
Odpowiedzi

Dodaj komentarz

Proszę, zostaw swój komentarz w celu dopowiedzenia tego, czego ja nie wiedziałem, lub wywołania ciekawej dyskusji. Wprowadziłem moderowanie komentarzy ze względu na dużą popularność bloga wśród różnych SEO botów :)

w4cky - BST

wtorek, 25 września 2012