PHP爬虫 -- 019 实战3 爬取BOSS直聘的招聘信息(思路分析)

940 阅读1分钟

思路分析, 为后面的讲解做准备

  • 我们点击测试

  • 我们看一下url

URL 的规律：`https://www.zhipin.com/{城市代码}/?query={关键字}&page={页码}`（例如郑州的城市代码为 `c101180100`）

我们需要爬取的数据：公司名称、职位名称、工作年限、学历要求、工作地址、城市、职位描述、详情页 URL、薪资

创建数据表

-- One row per scraped job posting. The column names match the keys of the
-- $data array that get_and_save_data() passes to save_data().
CREATE TABLE `jobs` (
  -- Surrogate primary key, assigned by MySQL.
  `id` int(11) NOT NULL AUTO_INCREMENT,
  -- Company name text taken from the job detail page.
  `company_name` varchar(255) NOT NULL,
  -- Job title (the <h1> of the detail page header).
  `job_name` varchar(255) NOT NULL,
  -- Second segment of the detail page's requirements line
  -- (presumably the required work experience; verify against the page).
  `work_year` varchar(255) NOT NULL,
  -- Third segment of the same requirements line
  -- (presumably the required education; verify against the page).
  `education` varchar(255) NOT NULL,
  -- Text of the page's "location-address" element.
  `address` varchar(255) NOT NULL,
  -- Name of the city being crawled (a key of $city_arr in the script).
  `city` varchar(255) NOT NULL,
  -- Text of the first "job-sec" section of the detail content.
  `description` text NOT NULL,
  -- URL of the detail page the row was scraped from.
  `url` varchar(255) NOT NULL,
  -- Salary text as displayed on the page; note the 10-character limit.
  `salary` varchar(10) NOT NULL,
  PRIMARY KEY (`id`)
) ENGINE=MyISAM DEFAULT CHARSET=utf8;

参考代码

<?php
// Composer autoloader. Resolved from this script's own directory and written
// with forward slashes so it loads on Linux/macOS as well as on Windows
// (a literal backslash in the path is only a separator on Windows).
require __DIR__ . '/vendor/autoload.php';
use QL\QueryList;
use Medoo\Medoo;
// Database connection (Medoo); used by save_data().
$database = new Medoo([
    'database_type' => 'mysql',
    'database_name' => 'demo_db',
    'server' => 'localhost',
    'username' => 'root',
    'password' => 'root',
    'charset' => 'utf8',
]);

// Search keyword placed in the listing URL's "query" parameter.
$keyword = 'php';
// Running counter of detail pages processed; used for the progress output.
$count = 1;
// Cities to crawl: display name => city code used in the zhipin.com URL path.
// Uncomment an entry to include that city.
$city_arr = [
    // "北京"=>'c101010100',
    // "上海"=>'c101020100',
    // "广州"=>'c101280100',
    // "深圳"=>'c101280600',
    // "杭州"=>'c101210100',
    "郑州"=>'c101180100'
];

// Name of the city currently being crawled; set by start() and stored with
// every saved row.
$current_city = "";
// Shared QueryList instance used for all HTML parsing.
$ql = new QueryList();

/**
 * Entry point of the crawl.
 *
 * For every city in the global $city_arr, builds the search listing URL for
 * the global $keyword, collects the detail-page links via get_detail_list()
 * and hands each one to get_and_save_data(). The global $current_city is
 * updated before a city is processed so saved rows carry the right city name.
 *
 * @return void
 */
function start(){
    global $city_arr,$keyword,$current_city;
    foreach ($city_arr as $city_name => $city_code) {
        $current_city = $city_name;
        // Encode the keyword so spaces, '&', '#' or non-ASCII text cannot
        // break the query string.
        $url = "https://www.zhipin.com/{$city_code}/?query=".urlencode($keyword);
        // Distinct loop variable: the inner loop no longer overwrites the
        // outer loop's city name/code.
        foreach (get_detail_list($url) as $item) {
            get_and_save_data($item['detail_url']);
        }
    }
}
start();
/**
 * Collect the detail-page URLs from the paginated search listing.
 *
 * Pages are requested as "{$url}&page=N" starting at N = 1. Crawling stops
 * after $max_pages pages or as soon as the pager's "next" link has the class
 * "next disabled", whichever comes first.
 *
 * @param string $url       Listing URL that already contains a query string.
 * @param int    $max_pages Maximum number of listing pages to read
 *                          (default 3, the limit that was previously hard-coded).
 * @return array List of ['detail_url' => absolute URL] entries.
 */
function get_detail_list($url, $max_pages = 3){
    global $ql;
    $data = [];
    for ($page = 1; $page <= $max_pages; $page++) {
        $current_url = $url."&page=".$page;
        // Download the page once and reuse the parsed document for both the
        // job links and the pager state, instead of fetching it twice.
        $doc = $ql->html(get_html_source($current_url));
        $tmp = $doc->rules([
            'detail_url'=>['#main > div > div.job-list > ul > li > div > div.info-primary > h3 > a','href','',function($a_href){
                // hrefs on the listing are site-relative; make them absolute.
                return "https://www.zhipin.com".$a_href;
            }]
        ])->queryData();
        $data = array_merge($data,$tmp);
        $next = $doc->find('a.next[ka="page-next"]')->attr('class');
        // The "next" link is marked disabled on the last page.
        if($next == "next disabled"){
            break;
        }
    }
    return $data;

}

/**
 * Scrape one job detail page and store it via save_data().
 *
 * Prints the URL and a progress line (based on the global $count, which is
 * incremented per call), downloads the page through get_html_source(),
 * extracts the fields with the global QueryList instance and inserts them
 * together with the global $current_city.
 *
 * @param string $url Absolute URL of the job detail page.
 * @return void
 */
function get_and_save_data($url){
    echo $url."\n";
    global $ql,$current_city,$count;
    // Guard up front: an empty URL can never be downloaded, so skip it
    // instead of entering the retry loop in get_html_source().
    if(!$url){
        echo "Skipping record: empty detail URL.\n";
        return;
    }
    // NOTE(review): the total of 90 is hard-coded (presumably 3 listing
    // pages x 30 jobs); confirm if the page limit changes.
    $percent = ($count/90)*100;
    echo "现在是第{$count}页, 总共90页, 当前进度{$percent}%.\n";
    $count++;
    // Parse the HTML once and run every selector against the same document.
    $doc = $ql->html(get_html_source($url));
    $data = [];
    $data['company_name'] = $doc->find('div.job-sec > div.name')->text();
    if(!$data['company_name']){
        // Fallback selector for pages without the company block above.
        $data['company_name'] = $doc->find("a[ka='job-detail-company_custompage']")->text();
    }
    $data['salary'] = $doc->find("div.name>span.salary")->text();
    $data['job_name'] = $doc->find('div.info-primary>div.name>h1')->text();
    // The requirements line is split on its separator element; segments 1 and
    // 2 are stored as work_year and education. Missing segments (layout
    // change, blocked page) become '' instead of raising an undefined-offset
    // warning and inserting NULL into a NOT NULL column.
    $info = $doc->find('div.job-primary.detail-box>div.info-primary > p')->html();
    $info_parts = explode('<em class="dolt"></em>',(string)$info);
    $data['work_year'] = isset($info_parts[1]) ? $info_parts[1] : '';
    $data['education'] = isset($info_parts[2]) ? $info_parts[2] : '';
    $data['address'] = $doc->find('div.location-address')->text();
    $data['city'] = $current_city;
    $data['description'] = $doc->find("div.detail-content>div:nth-child(1).job-sec>div.text")->text();
    $data['url'] = $url;
    save_data($data);
}

/**
 * Insert one scraped job record into the `jobs` table.
 *
 * Uses the global $database connection object.
 *
 * @param array $data Column name => value pairs for the new row.
 * @return void
 */
function save_data($data){
    $table = 'jobs';
    $GLOBALS['database']->insert($table, $data);
}
/**
 * Download a page through the HTTP proxy at http-dyn.abuyun.com, retrying
 * until a non-empty response body is received.
 *
 * Each attempt echoes the target URL; each failure echoes the cURL error so
 * a stuck crawl can be diagnosed.
 *
 * @param string $url          Target URL.
 * @param int    $max_attempts Maximum number of attempts. 0 (the default)
 *                             retries without limit, as the function always did.
 * @return string Response body.
 * @throws RuntimeException When $max_attempts > 0 and every attempt failed.
 */
function get_html_source($url, $max_attempts = 0) {
    // Proxy server.
    $proxyServer = "http://http-dyn.abuyun.com:9020";
    // Proxy tunnel credentials.
    // NOTE(review): secrets should not live in source control. They can be
    // supplied via ABUYUN_PROXY_USER / ABUYUN_PROXY_PASS; the literals below
    // remain only as a backward-compatible fallback.
    $proxyUser = getenv('ABUYUN_PROXY_USER') ?: "H19D75L76VK89Q8D";
    $proxyPass = getenv('ABUYUN_PROXY_PASS') ?: "8C17B0A80F475BD8";
    // Request options are identical for every attempt, so build them once.
    $options = [
        CURLOPT_URL => $url,
        CURLOPT_HTTPPROXYTUNNEL => false,
        // NOTE(review): certificate verification is disabled, as in the
        // original; enable it if the proxy setup allows.
        CURLOPT_SSL_VERIFYPEER => false,
        // Proxy server settings.
        CURLOPT_PROXYTYPE => CURLPROXY_HTTP,
        CURLOPT_PROXY => $proxyServer,
        // Proxy authentication.
        CURLOPT_PROXYAUTH => CURLAUTH_BASIC,
        CURLOPT_PROXYUSERPWD => "{$proxyUser}:{$proxyPass}",
        CURLOPT_USERAGENT => "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727;)",
        CURLOPT_CONNECTTIMEOUT => 3,
        CURLOPT_TIMEOUT => 5,
        CURLOPT_RETURNTRANSFER => true,
    ];
    $attempt = 0;
    while (true) {
        $attempt++;
        echo $url."\n";
        $ch = curl_init();
        curl_setopt_array($ch, $options);
        $result = curl_exec($ch);
        $error = curl_error($ch);
        curl_close($ch);
        if ($result !== false && $result !== '') {
            return $result;
        }
        $reason = $error !== '' ? $error : 'empty response';
        echo "Request failed (attempt {$attempt}): {$reason}\n";
        if ($max_attempts > 0 && $attempt >= $max_attempts) {
            throw new RuntimeException("Giving up on {$url} after {$attempt} attempts: {$reason}");
        }
    }
}