Cytat(SmokAnalog @ 9.02.2018, 23:52:51 )

Darko, nie oUbolewam nad tym, że przez całą noc crawluję zaledwie 20-30 tys. wyników,
Jeśli to za mało, to mam kilka propozycji:
* cURL zamiast file_get_contents (w wielu wypadkach cURL jest szybsze, więcej rzeczy można ustawić)
* curl_multi - aby zwielokrotnić ilość pobieranych zasobów
* kilka procesów na raz
Prosty przykład:
<?php
// Sequential benchmark: download every topic page with file_get_contents()
// and print one line per request containing a timestamp, the topic ID, the
// response length and the current memory usage.
for ($i = 259010; $i < 259310; $i++) {
    $url = 'http://forum.php.pl/index.php?showtopic=' . $i;

    // file_get_contents() returns false on failure; report that as length 0
    // instead of passing a boolean to strlen().
    $html = file_get_contents($url);
    if ($html === false) {
        $html = '';
    }

    $usage = memory_get_usage();
    $usageTrue = memory_get_usage(true);

    // Take seconds and microseconds from the same clock reading and zero-pad
    // the microseconds, so that e.g. 5 µs is printed as ".000005" and not ".5".
    $now = gettimeofday();
    echo date('H:i:s', $now['sec']).'.'.sprintf('%06d', $now['usec']).' | ID: '.$i.' length: '.strlen($html).' | usage: '.$usage.' | usage(true) '.$usageTrue.PHP_EOL;
}
Test (obcięte wyniki do dwóch pierwszych i ostatnich)
$ time php file_get_contents.php
13:48:46.202671 | ID: 259010 length: 221566 | usage: 592840 | usage(true) 2097152
13:48:46.291102 | ID: 259011 length: 40984 | usage: 414280 | usage(true) 2097152
..
13:49:54.433896 | ID: 259308 length: 38852 | usage: 410184 | usage(true) 2097152
13:49:54.573305 | ID: 259309 length: 76766 | usage: 447048 | usage(true) 2097152
real 1m8.822s
user 0m0.193s
sys 0m0.284s
Wersja na curl:
<?php
// Sequential benchmark: download every topic page with a plain cURL handle
// and print one line per request containing a timestamp, the topic ID, the
// response length and the current memory usage.
for ($i = 259010; $i < 259310; $i++) {
    $url = 'http://forum.php.pl/index.php?showtopic=' . $i;

    // A new handle is created for every request, so no connection is reused
    // between iterations.
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 3);
    curl_setopt($ch, CURLOPT_HEADER, 0);
    $html = curl_exec($ch);
    curl_close($ch);

    // curl_exec() returns false on failure; report that as length 0 instead
    // of passing a boolean to strlen().
    if ($html === false) {
        $html = '';
    }

    $usage = memory_get_usage();
    $usageTrue = memory_get_usage(true);

    // Take seconds and microseconds from the same clock reading and zero-pad
    // the microseconds, so that e.g. 5 µs is printed as ".000005" and not ".5".
    $now = gettimeofday();
    echo date('H:i:s', $now['sec']).'.'.sprintf('%06d', $now['usec']).' | ID: '.$i.' length: '.strlen($html).' | usage: '.$usage.' | usage(true) '.$usageTrue.PHP_EOL;
}
$ time php curl.php
13:50:54.645658 | ID: 259010 length: 221568 | usage: 594096 | usage(true) 2097152
13:50:54.875325 | ID: 259011 length: 40984 | usage: 415536 | usage(true) 2097152
13:51:41.684053 | ID: 259308 length: 38852 | usage: 411440 | usage(true) 2097152
13:51:41.888811 | ID: 259309 length: 76766 | usage: 448304 | usage(true) 2097152
real 0m48.066s
user 0m0.227s
sys 0m0.270s
A bez
curl_setopt($ch, CURLOPT_HEADER, 0); trwało porównywalnie do wersji z
file_get_contents:
real 0m56.110s
user 0m0.176s
sys 0m0.185s
Wersja z curl_multi:
<?php
// Concurrent benchmark: keep a fixed number of transfers in flight with
// curl_multi and queue the next topic as soon as one of them finishes.
// Prints one line per finished transfer containing a timestamp, the topic ID,
// the response length and the current memory usage.

// Topic IDs to download (despite the name, these are IDs, not full URLs).
$urls = [];
for ($i = 259010; $i < 259310; $i++) {
    $urls[] = $i;
}

$ch_multi = curl_multi_init();
$options = [
    CURLOPT_RETURNTRANSFER => true,
    CURLOPT_FOLLOWLOCATION => true,
    CURLOPT_MAXREDIRS => 3,
    CURLOPT_HEADER => 0,
    CURLOPT_CONNECTTIMEOUT => 3,
];

// Creates a handle for one topic and attaches it to the multi handle.
// The topic ID is stored on the handle itself (CURLOPT_PRIVATE) so that the
// finished transfer can be matched to its ID regardless of completion order.
$enqueue = function ($topicId) use ($ch_multi, $options) {
    $ch = curl_init();
    curl_setopt_array($ch, $options);
    curl_setopt($ch, CURLOPT_URL, 'http://forum.php.pl/index.php?showtopic=' . $topicId);
    curl_setopt($ch, CURLOPT_PRIVATE, (string) $topicId);
    curl_multi_add_handle($ch_multi, $ch);
};

$concurrency = 2;        // number of simultaneous transfers
$total = count($urls);
$next = 0;               // index of the next topic that has not been queued yet
$inFlight = 0;           // handles currently attached to the multi handle

// Seed the pool with the first $concurrency topics.
while ($next < $total && $inFlight < $concurrency) {
    $enqueue($urls[$next++]);
    $inFlight++;
}

// Loop on our own in-flight counter: $running from curl_multi_exec() can be 0
// right after the last active transfer ended even though new handles were
// queued in the same iteration.
while ($inFlight > 0) {
    do {
        $execrun = curl_multi_exec($ch_multi, $running);
    } while ($execrun === CURLM_CALL_MULTI_PERFORM);

    if ($execrun !== CURLM_OK) {
        break;
    }

    $handled = 0;
    while ($done = curl_multi_info_read($ch_multi)) {
        $handle = $done['handle'];

        // A failed transfer has no content; report it as length 0.
        $html = (string) curl_multi_getcontent($handle);
        $topicId = curl_getinfo($handle, CURLINFO_PRIVATE);

        $usage = memory_get_usage();
        $usageTrue = memory_get_usage(true);

        // Take seconds and microseconds from the same clock reading and
        // zero-pad the microseconds.
        $now = gettimeofday();
        echo date('H:i:s', $now['sec']).'.'.sprintf('%06d', $now['usec']).' | ID: '.$topicId.' length: '.strlen($html).' | usage: '.$usage.' | usage(true) '.$usageTrue.PHP_EOL;

        curl_multi_remove_handle($ch_multi, $handle);
        $inFlight--;
        $handled++;

        // Refill the freed slot, but never read past the end of the list.
        if ($next < $total) {
            $enqueue($urls[$next++]);
            $inFlight++;
        }
    }

    // Nothing finished this round: block until there is socket activity
    // instead of spinning on curl_multi_exec() and burning CPU.
    if ($handled === 0 && $inFlight > 0) {
        if (curl_multi_select($ch_multi, 1.0) === -1) {
            // select can fail immediately on some setups; back off briefly.
            usleep(1000);
        }
    }
}

curl_multi_close($ch_multi);
$ time php curl_multi.php
13:55:01.296270 | ID: 259011 length: 40987 | usage: 442440 | usage(true) 2097152
13:55:01.425666 | ID: 259012 length: 41189 | usage: 444104 | usage(true) 2097152
13:55:33.281197 | ID: 259308 length: 76766 | usage: 518152 | usage(true) 2097152
13:55:33.370113 | ID: 259309 length: 38852 | usage: 518112 | usage(true) 2097152
real 0m32.255s
user 0m20.641s
sys 0m11.609s
Podsumowanie:
* file_get_contents: 1m8.822s
* curl: 0m48.066s
* curl_multi: 0m32.255s