queryList爬蟲獲取內容的幾種方法總結 queryList給抓取的內容增加html追加元素html 代碼實例...

//簡略內容:
1.
$data1 = $ql->find('.two img')->map(function($item){return $item->alt;
});
// 等價下面這句話
$data2 = $ql->find('.two img')->attrs('alt');

2.
$texts = $ql->find('.two>a')->texts();
$htmls = $ql->find('#one span')->htmls();

3.
$ql = QueryList::get('https://www.ithome.com/html/discovery/358585.htm');
$rt = [];
// 采集文章標題
$rt['title'] = $ql->find('h1')->text();

4. 采集列表的所有結果：用 all()
$rt = QueryList::get($url)->rules($rules)->query()->getData();
print_r($rt->all());
//QueryList V4.0.4版本新增了一個queryData()語法糖來簡化這種操作:
$rt = QueryList::get($url)->rules($rules)->queryData();
// queryData() 方法等同于 query()->getData()->all()

$ql = QueryList::html($html);
// 獲取div元素對象
$div = $ql->find('div:eq(0)');
// 向div元素中追加一個img元素（queryList給抓取的內容增加html、追加元素html）
// （此用法見文檔的「元素操作」頁面，而不是「結果處理」頁面：http://www.querylist.cc/docs/guide/v4/modify-dom）
$div->append('<img src="1.jpg" />');

//獲取HTTP響應頭等信息
use GuzzleHttp\Client;$client = new Client();
$response = $client->get('http://httpbin.org/get');
// 獲取響應頭部信息
$headers = $response->getHeaders();

//內容過濾
// 采集正文內容
$eles = QueryList::html($html)->find('#content');
// 選擇正文內容中要移除的元素，并移除
$eles->find('.tt,span:last,p:last')->remove();
//獲取純凈的正文內容
$content = $eles->html();

$rt = QueryList::rules($rules)->html($html)->query()->getData();

$rt = QueryList::rules($rules)->html($html)->query()->getData(function($item){
    $ql = QueryList::html($item['content']);
    $ql->find('.tt,span:last,p:last')->remove();
    $item['content'] = $ql->find('')->html();
    return $item;
});

//QueryList內置的HTTP客戶端
//更多高級參數
//還可以攜帶更多高級參數，如：設置超時時間、設置代理等。
$ql = QueryList::get('http://httpbin.org/get',[
    'param1' => 'testvalue',
    'params2' => 'somevalue'
],[
    // 設置代理
    'proxy' => 'http://222.141.11.17:8118',
    //設置超時時間，單位：秒
    'timeout' => 30,
    'headers' => [
        'Referer' => 'https://querylist.cc/',
        'User-Agent' => 'testing/1.0',
        'Accept'     => 'application/json',
        'X-Foo'      => ['Bar', 'Baz'],
        'Cookie'    => 'abc=111;xxx=222'
    ]
]);

//使用文件緩存驅動
//// 緩存文件夾路徑
//$cache_path = __DIR__.'/temp/';
$ql = QueryList::get($url,null,['cache' => $cache_path,'cache_ttl' => 600 // 緩存有效時間，單位：秒，可以不設置緩存有效時間
]);//使用 HTTP Cache
//use GuzzleHttp\Psr7\Response;
//use QL\QueryList;
//
//$urls = [
//    'https://github.com/trending/go?since=daily',
//    'https://github.com/trending/html?since=daily',
//    'https://github.com/trending/java?since=daily'
//];

QueryList::multiGet($urls)
    ->success(function(QueryList $ql,Response $response, $index) use($urls){
        echo 'Current url: '.$urls[$index]."\r\n";
        $data = $ql->find('h3>a')->texts();
        print_r($data->all());
    })
    ->send();

//更高級的用法
//use GuzzleHttp\Psr7\Response;
//use QL\QueryList;
//
//$urls = [
//    'https://github.com/trending/go?since=daily',
//    'https://github.com/trending/html?since=daily',
//    'https://github.com/trending/java?since=daily'
//];
//
//$rules = [
//    'name' => ['h3>a','text'],
//    'desc' => ['.py-1','text']
//];
//$range = '.repo-list>li';
QueryList::rules($rules)
    ->range($range)
    ->multiGet($urls)
    // 設置并發數為2
    ->concurrency(2)
    // 設置GuzzleHttp的一些其他選項
    ->withOptions([
        'timeout' => 60
    ])
    // 設置HTTP Header
    ->withHeaders([
        'User-Agent' => 'QueryList'
    ])
    // HTTP success回調函數
    ->success(function (QueryList $ql, Response $response, $index){
        $data = $ql->queryData();
        print_r($data);
    })
    // HTTP error回調函數
    ->error(function (QueryList $ql, $reason, $index){
        // ...
    })
    ->send();

//詳細版
1.
$data1 = $ql->find('.two img')->map(function($item){return $item->alt;
});
// 等價下面這句話
$data2 = $ql->find('.two img')->attrs('alt');
print_r($data1->all());

2.
$texts = $ql->find('.two>a')->texts();
$htmls = $ql->find('#one span')->htmls();
print_r($texts->all());

3.
use QL\QueryList;

$ql = QueryList::get('https://www.ithome.com/html/discovery/358585.htm');
$rt = [];
// 采集文章標題
$rt['title'] = $ql->find('h1')->text();

4. 采集列表的所有結果：用 all()
//use QL\QueryList;//$url = 'https://www.ithome.com/html/discovery/358585.htm';
//// 定義采集規則
//$rules = [
//    // 采集文章標題
//    'title' => ['h1','text'],
//    // 采集文章作者
//    'author' => ['#author_baidu>strong','text'],
//    // 采集文章內容
//    'content' => ['.post_content','html']
//];
$rt = QueryList::get($url)->rules($rules)->query()->getData();
print_r($rt->all());$rt = QueryList::get($url)->rules($rules)->query()->getData();
print_r($rt->all());
//QueryList V4.0.4版本新增了一個queryData()語法糖來簡化這種操作:

$rt = QueryList::get($url)->rules($rules)->queryData();
// queryData() 方法等同于 query()->getData()->all()

//QueryList內置的HTTP客戶端
//更多高級參數
//還可以攜帶更多高級參數，如：設置超時時間、設置代理等。

$ql = QueryList::get('http://httpbin.org/get',[
    'param1' => 'testvalue',
    'params2' => 'somevalue'
],[
    // 設置代理
    'proxy' => 'http://222.141.11.17:8118',
    //設置超時時間，單位：秒
    'timeout' => 30,
    'headers' => [
        'Referer' => 'https://querylist.cc/',
        'User-Agent' => 'testing/1.0',
        'Accept'     => 'application/json',
        'X-Foo'      => ['Bar', 'Baz'],
        'Cookie'    => 'abc=111;xxx=222'
    ]
]);

//使用文件緩存驅動
//// 緩存文件夾路徑
//$cache_path = __DIR__.'/temp/';
$ql = QueryList::get($url,null,[
    'cache' => $cache_path,
    'cache_ttl' => 600 // 緩存有效時間，單位：秒，可以不設置緩存有效時間
]);

//使用 HTTP Cache
//use GuzzleHttp\Psr7\Response;
//use QL\QueryList;
//
//$urls = [
//    'https://github.com/trending/go?since=daily',
//    'https://github.com/trending/html?since=daily',
//    'https://github.com/trending/java?since=daily'
//];

QueryList::multiGet($urls)
    ->success(function(QueryList $ql,Response $response, $index) use($urls){
        echo 'Current url: '.$urls[$index]."\r\n";
        $data = $ql->find('h3>a')->texts();
        print_r($data->all());
    })
    ->send();

//更高級的用法
//use GuzzleHttp\Psr7\Response;
//use QL\QueryList;
//
//$urls = [
//    'https://github.com/trending/go?since=daily',
//    'https://github.com/trending/html?since=daily',
//    'https://github.com/trending/java?since=daily'
//];
//
//$rules = [
//    'name' => ['h3>a','text'],
//    'desc' => ['.py-1','text']
//];
//$range = '.repo-list>li';
QueryList::rules($rules)
    ->range($range)
    ->multiGet($urls)
    // 設置并發數為2
    ->concurrency(2)
    // 設置GuzzleHttp的一些其他選項
    ->withOptions([
        'timeout' => 60
    ])
    // 設置HTTP Header
    ->withHeaders([
        'User-Agent' => 'QueryList'
    ])
    // HTTP success回調函數
    ->success(function (QueryList $ql, Response $response, $index){
        $data = $ql->queryData();
        print_r($data);
    })
    // HTTP error回調函數
    ->error(function (QueryList $ql, $reason, $index){
        // ...
    })
    ->send();

//獲取HTTP響應頭等信息
use GuzzleHttp\Client;$client = new Client();
$response = $client->get('http://httpbin.org/get');
// 獲取響應頭部信息
$headers = $response->getHeaders();

//內容過濾
// 采集正文內容
$eles = QueryList::html($html)->find('#content');
// 選擇正文內容中要移除的元素，并移除
$eles->find('.tt,span:last,p:last')->remove();
//獲取純凈的正文內容
$content = $eles->html();

//$rules = [
//    // 移除內容中所有的超鏈接，但保留超鏈接的內容，并移除內容中所有p標簽，但保留p標簽的內容
//    'content_html' => ['#content','html','a p'],
//    // 保留內容中的超鏈接，以及保留p標簽及內容
//    'content_text' => ['#content','text','a p'],
//];

$rt = QueryList::rules($rules)->html($html)->query()->getData();//
//$rules = [
//    'content' => ['#content','html']
//];

$rt = QueryList::rules($rules)->html($html)->query()->getData(function($item){
    $ql = QueryList::html($item['content']);
    $ql->find('.tt,span:last,p:last')->remove();
    $item['content'] = $ql->find('')->html();
    return $item;
});

$data1 = $ql->find('.two img')->map(function($item){
    return $item->alt;
});
$data1 = $ql->find('.two img')->map(function($item){
    return $item->alt;
});
// 等價下面這句話
$data2 = $ql->find('.two img')->attrs('alt');
print_r($data1->all());
$texts = $ql->find('.two>a')->texts();
$htmls = $ql->find('#one span')->htmls();
print_r($texts->all());

轉載于:https://www.cnblogs.com/stillstep/p/10953512.html