web-dev-qa-db-ja.com

PHPで並列cURLリクエストを行う

15個の異なるURLからJSONデータを読み取る単純なアプリを動かしています。事情があって、この処理はサーバー側で行う必要があります。取得には file_get_contents($url) を使用しています。

file_get_contents($url) を使って、次のような簡単なスクリプトを書きました:

// List of the URLs to read ($url1 … $url15 are placeholders; the "..." line
// stands for the omitted entries and is not valid PHP as written).
$websites = array(
    $url1,
    $url2,
    $url3,
     ...
    $url15
);

// Fetch the URLs one after another: file_get_contents() is called once per
// entry and each returned value is appended to $data.
foreach ($websites as $website) {
    $data[] = file_get_contents($website);
}

しかし、前のリクエストの完了を待ってから次のリクエストを行うため、非常に遅いことが分かりました。

37
user1205408

multi cURL(curl_multi)のことを指しているのであれば、次のようなコードが役立つかもしれません:


// Fetch several URLs concurrently with the curl_multi API and print the
// response bodies. $url1..$url3 are placeholders for the real URLs.
$nodes = array($url1, $url2, $url3);
$node_count = count($nodes);

$curl_arr = array();
$results = array();
$master = curl_multi_init();

// One easy handle per URL; CURLOPT_RETURNTRANSFER makes the body available
// through curl_multi_getcontent() instead of being written to output.
for($i = 0; $i < $node_count; $i++)
{
    $url =$nodes[$i];
    $curl_arr[$i] = curl_init($url);
    curl_setopt($curl_arr[$i], CURLOPT_RETURNTRANSFER, true);
    curl_multi_add_handle($master, $curl_arr[$i]);
}

// Drive the transfers. Between calls to curl_multi_exec() we block in
// curl_multi_select() until there is activity (or the timeout expires), so the
// loop does not spin at 100% CPU. The loop also stops if curl_multi_exec()
// reports an error, instead of retrying forever.
$running = 0;
do {
    $status = curl_multi_exec($master, $running);

    if ($running > 0 && curl_multi_select($master, 1.0) === -1) {
        // select could not wait on anything; back off briefly so we still do
        // not busy-loop.
        usleep(1000);
    }
} while($running > 0 && $status === CURLM_OK);

// Collect the bodies in the same order as $nodes, then release the handles.
for($i = 0; $i < $node_count; $i++)
{
    $results[] = curl_multi_getcontent($curl_arr[$i]);
    curl_multi_remove_handle($master, $curl_arr[$i]);
}
curl_multi_close($master);

print_r($results);

それが何らかの形で役立つことを願っています

134

CPU使用率が100%に張り付いたり、わずかなエラーや予期しない事態でクラッシュしたりすることのない、より完全な例を示したいと思います。

また、ヘッダー、本文、リクエスト情報を取得する方法と、リダイレクトを手動で追跡する方法も示します。

免責事項: このコードは、拡張してライブラリとして実装するため、あるいは手早い出発点として使うためのものです。そのため、内部の関数は最小限に抑えています。

/**
 * Current Unix time in seconds as a float (microtime(true)).
 *
 * @return float
 */
function mtime(){
    $now = microtime(true);
    return $now;
}
/**
 * Format the time elapsed since $prev for log output.
 *
 * @param float $prev Start time in seconds, as returned by microtime(true).
 * @return string Elapsed milliseconds, right-padded with "0" to at least 20 characters.
 */
function ptime($prev){
    // seconds -> milliseconds
    $elapsedMs = (microtime(true) - $prev) * 1000;

    // STR_PAD_RIGHT is str_pad()'s default direction
    return str_pad($elapsedMs, 20, 0);
}


/**
 * Run curl_multi_exec() and return its status, repeating the call only while
 * it keeps answering CURLM_CALL_MULTI_PERFORM.
 *
 * CURLM_CALL_MULTI_PERFORM is deprecated and should not be returned any more,
 * so in the normal case this is a single curl_multi_exec() call.
 *
 * @param mixed $mh            The cURL multi handle.
 * @param int   $still_running Set (by reference) to the number of transfers still running.
 * @return int The last status returned by curl_multi_exec().
 */
function curl_multi_exec_full($mh, &$still_running) {
    while (true) {
        $state = curl_multi_exec($mh, $still_running);

        // Nothing left to run: done.
        if ($still_running <= 0) {
            break;
        }

        // Any status other than "call me again" is handed back to the caller.
        if ($state !== CURLM_CALL_MULTI_PERFORM) {
            break;
        }

        // Wait up to 0.1s before retrying so a misbehaving handle cannot pin
        // the CPU; stop retrying when select reports no activity (0).
        if (!curl_multi_select($mh, 0.1)) {
            break;
        }
    }

    return $state;
}
/**
 * Wait for activity on a cURL multi handle, but never return sooner than
 * $minTime seconds after the call started.
 *
 * curl_multi_select() normally blocks until one of the transfers has activity
 * or $maxTime elapses. It can also return immediately (for example with -1),
 * in which case a caller looping on curl_multi_exec() would spin the CPU. The
 * minimum wait enforced here keeps consecutive iterations at least $minTime
 * apart.
 *
 * @param mixed     $mh      The cURL multi handle.
 * @param float     $minTime Minimum time to spend in this call, in seconds.
 * @param float|int $maxTime Maximum time curl_multi_select() may block, in seconds.
 * @return void
 */
function curl_multi_wait($mh, $minTime = 0.001, $maxTime = 1){
    $start_time = microtime(true);

    // Sleeps until there is activity on any of the transfers, or $maxTime passes.
    curl_multi_select($mh, $maxTime);

    // Both values are in SECONDS here. The previous version compared elapsed
    // seconds against a microsecond count, so it slept on almost every call.
    $remaining = $minTime - (microtime(true) - $start_time);

    if ($remaining > 0) {
        // usleep() takes whole microseconds; round up so we never undershoot.
        usleep((int) ceil($remaining * 1000000));
    }
}


// Per-request cURL option sets, one entry per URL to fetch.
// Built inside an immediately-invoked closure so the helper values do not
// become extra global variables.
$handles = (function () {
    // Options shared by every request: no headers in the body, return the
    // body instead of printing it, and do not follow redirects automatically.
    $shared = [
        CURLOPT_HEADER=>false,
        CURLOPT_RETURNTRANSFER=>true,
        CURLOPT_FOLLOWLOCATION=>false,
    ];

    // Called by curl once for each header line received.
    // Header lines are handled byte-wise (RFC822 / RFC2616); do not switch to
    // the mb_ string functions here, that would be incorrect.
    // https://stackoverflow.com/a/41135574
    //
    // To collect headers instead of printing them, split the line once on ':'
    // and store trim()'d value under the strtolower(trim())'d name, skipping
    // lines without a ':'.
    $printHeader = function($ch, $header)
    {
        print "header from http://www.php.net: ".$header;

        // curl expects the number of bytes consumed
        return strlen($header);
    };

    return [
        [CURLOPT_URL=>"http://example.com/"] + $shared,
        [CURLOPT_URL=>"http://www.php.net"] + $shared + [CURLOPT_HEADERFUNCTION=>$printHeader],
    ];
})();




// Create the cURL multi handle that will drive all requests together.
$mh = curl_multi_init();

// Turn every option set in $handles into a configured easy handle, register
// it on the multi handle, and keep it in $chandles for later cleanup.
$chandles = array_map(function ($options) use ($mh) {
    $easy = curl_init();
    curl_setopt_array($easy, $options);
    curl_multi_add_handle($mh, $easy);

    return $easy;
}, array_values($handles));


// Execute the multi handle: drive all transfers until none is running (or
// curl_multi_exec reports an error), printing each finished transfer.
$count = 0;      // number of loop iterations, printed at the end for diagnostics
$running = 0;    // filled in by curl_multi_exec_full()
do {
    $time = mtime();

    // $running contains the number of currently running requests
    $status = curl_multi_exec_full($mh, $running);
    $count++;

    print ptime($time).": curl_multi_exec status=$status running $running".PHP_EOL;

    // Drain the message queue on EVERY pass. Reading only when $running had
    // dropped since the previous pass missed transfers that completed during
    // the very first curl_multi_exec call, so their results were never shown.
    //
    // Each message contains:
    //   msg:    The CURLMSG_DONE constant. Other return values are currently not available.
    //   result: One of the CURLE_* constants. If everything is OK, the CURLE_OK will be the result.
    //   handle: The curl handle which it concerns.
    while (($read = curl_multi_info_read($mh, $msgs_in_queue)) !== false) {
        print ptime($time).": curl_multi_info_read".PHP_EOL;

        $info = curl_getinfo($read['handle']);

        if ($read['result'] !== CURLE_OK) {
            // handle the error somehow
            print "Error: ".$info['url'].PHP_EOL;
            continue;
        }

        /*
        // This will automatically follow the redirect and still give you control over the previous page
        // TODO: max redirect checks and redirect timeouts
        if(isset($info['redirect_url']) && trim($info['redirect_url'])!==''){

            print "running redirect: ".$info['redirect_url'].PHP_EOL;
            $ch3 = curl_init();
            curl_setopt($ch3, CURLOPT_URL, $info['redirect_url']);
            curl_setopt($ch3, CURLOPT_HEADER, 0);
            curl_setopt($ch3, CURLOPT_RETURNTRANSFER, 1);
            curl_setopt($ch3, CURLOPT_FOLLOWLOCATION, 0);
            curl_multi_add_handle($mh,$ch3);
        }
        */

        print_r($info);
        $body = curl_multi_getcontent($read['handle']);
        print $body;
    }

    // Still running? Wait for activity instead of spinning.
    if ($running > 0) {
        curl_multi_wait($mh);
    }

} while ($running > 0 && $status === CURLM_OK);

// Detach every easy handle from the multi handle, then release the multi handle.
array_walk($chandles, function ($easy) use ($mh) {
    curl_multi_remove_handle($mh, $easy);
});
curl_multi_close($mh);

// Report how many times the exec loop ran.
echo $count, PHP_EOL;
0
Timo Huovinen