Napisałem sobie taką oto klasę:
<?php
class Scrapper{
public $url;
private $data;
private $dataAfter;
private $doc;
private $xpath;
private $ch;
function __construct($url){
libxml_use_internal_errors(true);
$this->url = $url;
$this->data = $this->curl($this->url);
$this->doc = new \DOMDocument();
$this->doc->loadHTML($this->data);
$this->xpath = new DOMXPath($this->doc);
}
}
public function queryTag($query){
$this->data = $this->xpath->query($query);
return $this;
}
}
public function getData($noHTML = false, $removeAttribute = false){
foreach ($this->data as $dataNodes){
if($removeAttribute === true) {
$dataNodes->removeAttribute('style');
$dataNodes->removeAttribute('class');
$dataNodes->removeAttribute('id');
}
if($noHTML === true){
$this->dataAfter .= $dataNodes->nodeValue;
}else{
$this->dataAfter .= $dataNodes->ownerDocument->saveHTML($dataNodes);
}
}
return $this->dataAfter;
}
private function curl($url){
CURLOPT_RETURNTRANSFER => TRUE, // Setting cURL's option to return the webpage data
CURLOPT_FOLLOWLOCATION => TRUE, // Setting cURL to follow 'location' HTTP headers
CURLOPT_AUTOREFERER => TRUE, // Automatically set the referer where following 'location' HTTP headers
CURLOPT_CONNECTTIMEOUT => 120, // Setting the amount of time (in seconds) before the request times out
CURLOPT_TIMEOUT => 120, // Setting the maximum amount of time for cURL to execute queries
CURLOPT_MAXREDIRS => 10, // Setting the maximum number of redirections to follow
CURLOPT_USERAGENT => "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.1a2pre) Gecko/2008073000 Shredder/3.0a2pre ThunderBrowse/3.2.1.8", // Setting the useragent
CURLOPT_URL => $this->url, // Setting cURL's URL option with the $url variable passed into the function
);
$this->ch = curl_init();
curl_setopt_array($this->ch, $options);
$this->data = curl_exec($this->ch);
return $this->data;
}
}
function __destruct(){
curl_close($this->ch);
}
}
$class = new \Scrapper('http://www.....');
$pic = $class->queryTag('//div[@id="left"]//img[@class="pic"]/@src')->getData();
$title = $class->queryTag('//div[@id="left"]//h2')->getData(true);
$text = $class->queryTag('//div[@id="left"]/p | //center')->getData(false, true);
Po wywołaniu tej klasy, przypisuję do każdej zmiennej szukanej wartości - zdjęcie, tytuł i treść.
Niestety tytuł zawiera również ciąg URL obrazka, natomiast tekst zawiera dodatkowo obrazek oraz tytuł. Gdzie robię błąd? Jak to oddzielić?
Jednocześnie proszę o sugestię co mogę poprawić w samej klasie.
Ten post edytował SN@JPER^ 24.11.2017, 17:47:16