PHP 爬取豆瓣电影信息
这其实就是一个 API 接口:按片名查询豆瓣,并以 JSON 格式返回影片信息,哈哈。
<?php
/**
 * Looks up one movie on Douban by title and prints its details as JSON.
 *
 * Flow:
 *   1. Query the subject_suggest endpoint with the keyword.
 *   2. Keep the suggestions whose title equals the keyword exactly and take
 *      the first one.
 *   3. Download that subject page and read the fields with XPath.
 *   4. Echo a {code, msg, data} envelope.
 *
 * Response codes emitted by this script:
 *   0     - success, "data" holds the scraped fields
 *   10001 - no suggestion matched the keyword
 *   10002 - an HTTP request failed (cURL error or empty body)
 */

// Resolve the helper relative to this file rather than the working directory.
require __DIR__ . "/Res.php";

/**
 * Prints a JSON envelope and terminates the script.
 *
 * json_encode()'s second argument is a bitmask of JSON_* flags, not a
 * boolean, so the default flags are used for every response.
 *
 * @param array $payload Envelope to emit.
 */
function douban_respond(array $payload)
{
    if (!headers_sent()) {
        header("Content-Type: application/json; charset=utf-8");
    }
    echo json_encode($payload);
    exit;
}

/**
 * Fetches a URL with the shared cURL handle.
 *
 * Ends the script with a 10002 envelope when the transfer fails or the
 * body is empty, so callers always receive a non-empty string.
 *
 * @param resource|object $curl Handle returned by curl_init().
 * @param string          $url  Absolute URL to download.
 * @return string Response body.
 */
function douban_fetch($curl, $url)
{
    curl_setopt($curl, CURLOPT_URL, $url);
    $body = curl_exec($curl);
    if ($body === false || $body === "") {
        douban_respond([
            "code" => 10002,
            "msg"  => "cURL error " . curl_errno($curl) . ": " . curl_error($curl),
        ]);
    }
    return $body;
}

/**
 * Returns the value of the first node matched by an XPath expression,
 * or an empty string when nothing matches.
 *
 * @param DOMXPath $xpath      Query object bound to the parsed page.
 * @param string   $expression XPath expression selecting nodes.
 * @return string
 */
function douban_first_value(DOMXPath $xpath, $expression)
{
    $nodes = $xpath->query($expression);
    if ($nodes instanceof DOMNodeList && $nodes->length > 0) {
        return $nodes->item(0)->nodeValue;
    }
    return "";
}

/**
 * Returns the values of every node matched by an XPath expression,
 * joined with commas (empty string when nothing matches).
 *
 * @param DOMXPath $xpath      Query object bound to the parsed page.
 * @param string   $expression XPath expression selecting nodes.
 * @return string
 */
function douban_joined_values(DOMXPath $xpath, $expression)
{
    $values = [];
    $nodes = $xpath->query($expression);
    if ($nodes instanceof DOMNodeList) {
        foreach ($nodes as $node) {
            $values[] = $node->nodeValue;
        }
    }
    return implode(",", $values);
}

// Title to search for.
$keyword = "千与千寻";

// One handle serves both requests; only the URL changes between them.
$curl = curl_init();
curl_setopt($curl, CURLOPT_HEADER, 0);
curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
// Keep certificate and host-name verification on so the HTTPS responses
// cannot be silently replaced by a man-in-the-middle.
curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, true);
curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, 2);

// The keyword is non-ASCII and must be percent-encoded in the query string.
$suggestUrl = "https://movie.douban.com/j/subject_suggest?q=" . urlencode($keyword);
$suggestions = json_decode(douban_fetch($curl, $suggestUrl));
if (!is_array($suggestions)) {
    // Not the expected JSON list: treat it as "no suggestions".
    $suggestions = [];
}

// Keep only suggestions whose title is exactly the keyword.
$arrat_res = [];
foreach ($suggestions as $suggestion) {
    if (isset($suggestion->title) && $suggestion->title === $keyword) {
        $arrat_res[] = $suggestion;
    }
}

if (empty($arrat_res) || empty($arrat_res[0]->url)) {
    douban_respond([
        "code" => 10001,
        "msg"  => "暂无片源信息",
    ]);
}

// Download the subject page of the first exact match.
$html = douban_fetch($curl, $arrat_res[0]->url);

// Real-world HTML is rarely valid; collect libxml warnings internally
// instead of silencing the call with "@".
$previousLibxmlMode = libxml_use_internal_errors(true);
$dom = new DOMDocument();
$dom->loadHTML($html);
libxml_clear_errors();
libxml_use_internal_errors($previousLibxmlMode);
$dom->normalize();
$xpath = new DOMXPath($dom);

// Directors.
$directors_res = douban_joined_values($xpath, "//*[@id='info']/span[1]/span[2]/a/text()");

// Title and year from the page heading.
$name = douban_first_value($xpath, "//*[@id='content']/h1/span[1]/text()");
$years = douban_first_value($xpath, "//*[@id='content']/h1/span[2]/text()");

// Poster image URL.
$img = douban_first_value($xpath, "//*[@id='mainpic']/a/img/@src");

// Release status: 1 when the page shows "尚未上映" (not yet released),
// otherwise 2. A missing status node is treated as released.
$status = douban_first_value($xpath, "//*[@id='interest_sectl']/div/div[2]/div/div[2]");
$is_on = (trim($status) === "尚未上映") ? 1 : 2;

// Screenwriters.
$screenwriters_res = douban_joined_values($xpath, "//*[@id='info']/span[2]/span[2]/a/text()");

// Cast: the whole text of the cast container(s), not the individual links.
$actors_res = douban_joined_values($xpath, "//*[@id='info']/span[3]/span[2]");

// Genres: the #info spans from index 5 up to the "制片国家/地区:" label.
$sear_res = getFunction::getRes(5, "制片国家/地区:", $xpath);
$types_res = $sear_res["res"];
// getRes() reports "" when no marker was found; normalise to an integer
// before doing arithmetic on it.
$num = (int) $sear_res["num"];

// Bare text nodes directly under #info that contain Chinese characters.
// The code below reads them positionally as country, language,
// (optionally extra runtime text) and alternative titles.
$attr = [];
$textNodes = $xpath->query("//*[@id='info']/text()");
if ($textNodes instanceof DOMNodeList) {
    foreach ($textNodes as $textNode) {
        if (preg_match('/[\x{4e00}-\x{9fa5}]/u', $textNode->nodeValue) === 1) {
            $attr[] = $textNode->nodeValue;
        }
    }
}

// NOTE(review): the "+4" and "+1" offsets are kept from the original script;
// presumably they skip the label spans between sections - verify against the
// current page markup.
if ($is_on === 1) {
    // Unreleased: no runtime yet; release dates run up to the "又名:" label.
    $show_res = "";
    $sear2_res = getFunction::getRes($num + 4, "又名:", $xpath);
    $time_res = $sear2_res["res"];
} else {
    // Release dates run up to the "片长:" label ...
    $sear2_res = getFunction::getRes($num + 4, "片长:", $xpath);
    $time_res = $sear2_res["res"];
    $num = (int) $sear2_res["num"];
    // ... and the runtime follows, up to the "又名:" label.
    $sear3_res = getFunction::getRes($num + 1, "又名:", $xpath);
    $show_res = $sear3_res["res"];
}

$country = isset($attr[0]) ? $attr[0] : "";
$languages = isset($attr[1]) ? $attr[1] : "";
if (count($attr) == 4) {
    // With four text nodes, the third is appended to the runtime and the
    // fourth holds the alternative titles.
    $show_res = $show_res . $attr[2];
    $byname = $attr[3];
} else {
    $byname = isset($attr[2]) ? $attr[2] : "";
}

// Links inside #info: with two links the first goes to "web_url" and the
// second to "imbd"; with a single link it goes to "imbd" only.
$secondLink = douban_first_value($xpath, "//*[@id='info']/a[2]/@href");
if ($secondLink !== "") {
    $urlim = $secondLink;
    $urls = douban_first_value($xpath, "//*[@id='info']/a[1]/@href");
} else {
    $urls = "";
    $urlim = douban_first_value($xpath, "//*[@id='info']/a[1]/@href");
}

$final_res = [
    "all_name"      => $name . $years,
    "name"          => $name,
    "year"          => $years,
    "img"           => $img,
    "directors"     => $directors_res,
    "screenwriters" => $screenwriters_res,
    "actors"        => $actors_res,
    "types"         => $types_res,
    "web_url"       => $urls,
    "country"       => $country,
    "languages"     => $languages,
    "ontime"        => $time_res,
    "showtime"      => $show_res,
    "byname"        => $byname,
    "imbd"          => $urlim,
];

douban_respond([
    "code" => 0,
    "msg"  => "抓取成功",
    "data" => $final_res,
]);

// Res.php - source of the helper file required at the top of this script:
<?php
/**
 * Helper for reading runs of <span> values out of the element with id="info".
 */
class getFunction
{
    /**
     * Collects the text of consecutive `//*[@id='info']/span[i]` elements,
     * starting at position $start, until a marker label is reached.
     *
     * A span is a marker when its text equals $key or the fixed label
     * "官方网站:". Once the first marker is seen, collection stops, but
     * scanning continues and a later marker overwrites the reported
     * position, so "num" is the position of the LAST marker seen.
     *
     * Scanning ends as soon as a position has no span (no later position
     * can have one either) or when $limit is reached.
     *
     * @param int      $start Position of the first span to read; XPath
     *                        positions start at 1, smaller values are
     *                        treated as 1.
     * @param string   $key   Label text that ends the run.
     * @param DOMXPath $xpath Query object bound to the parsed page.
     * @param int      $limit Exclusive upper bound on the span position
     *                        (defaults to 30).
     * @return array ["res" => comma-joined collected texts,
     *                "num" => position of the last marker seen, or ""
     *                         when no marker was found]
     */
    public static function getRes($start, $key, $xpath, $limit = 30)
    {
        $collected = [];
        $num = "";

        for ($i = max(1, (int) $start); $i < $limit; $i++) {
            $spans = $xpath->query("//*[@id='info']/span[" . $i . "]");
            if ($spans === false || $spans->length === 0) {
                // Positions are contiguous: nothing at $i means nothing after it.
                break;
            }

            $text = $spans->item(0)->nodeValue;
            if ($text === $key || $text === "官方网站:") {
                $num = $i;
            } elseif ($num === "") {
                // Still before the first marker: this span belongs to the run.
                $collected[] = $text;
            }
        }

        return ["res" => implode(",", $collected), "num" => $num];
    }
}
- 版权声明:此文如未标注转载,均为本站原创;可自由转载,但请注明出处《龙行博客》。
- 本文网址:https://www.liaotaoo.cn/257.html
- 上篇文章:root被我禁止远程登录了
- 下篇文章:git pull一直弹出vim编辑器解决日记