Scraping Wallstreetcn and Sina Finance data with Selenium


# Sina Finance data collection

import re
import time

import pymongo
from bs4 import BeautifulSoup
from selenium import webdriver
# from fake_useragent import UserAgent
# ua_list = UserAgent()
ua_list = 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
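# Note: ua_list is defined but never applied below, so Chrome sends its
# default user agent. A minimal sketch of wiring it in through ChromeOptions
# (inside either scraper, after `chromedriver` is set):
#
#     from selenium.webdriver.chrome.options import Options
#     opts = Options()
#     opts.add_argument('user-agent=' + ua_list)
#     driver = webdriver.Chrome(chromedriver, options=opts)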


def get_hej_news():
    """爬取华尔街见闻宏观新闻"""
    # Assumes a MongoDB instance is running locally on the default port
    client = pymongo.MongoClient('localhost', 27017)
    db = client['news']
    hej_news = db['hej_news']
    chromedriver = r"/usr/local/share/chromedriver"
    driver = webdriver.Chrome(chromedriver)
    # Open the target URL with get()
    driver.get('https://wallstreetcn.com/live/global')
    # Scroll down so more entries lazy-load; window.scrollBy(0, step) scrolls the page by step pixels per call
    js = 'window.scrollBy(0,3000)'
    driver.execute_script(js)
    time.sleep(5)
    js = 'window.scrollBy(0,5000)'
    driver.execute_script(js)
    time.sleep(5)
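    # Sturdier alternative to the fixed sleeps above (a sketch using
    # Selenium's explicit-wait API): block until the live-feed container
    # actually exists before grabbing page_source.
    #
    #     from selenium.webdriver.common.by import By
    #     from selenium.webdriver.support.ui import WebDriverWait
    #     from selenium.webdriver.support import expected_conditions as EC
    #     WebDriverWait(driver, 10).until(
    #         EC.presence_of_element_located((By.CLASS_NAME, 'livenews')))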
    pages = driver.page_source
    soup = BeautifulSoup(pages, 'html.parser')
    soup1 = soup.find('div', class_='livenews')
    content = soup1.find_all('div', class_='live-item')

    for i in content:
        new_time = i.find('span', attrs={'class': 'live-item__time__text'}).get_text()
        news = re.sub(r'\n|//', '', i.find('div', attrs={'class': 'content-html'}).get_text().strip())
        # Drop any stale copy keyed on the same timestamp before re-inserting
        if hej_news.count_documents({'new_time': new_time}) != 0:
            hej_news.delete_many({'new_time': new_time})
        data = {
            'new_time': new_time,
            'news': news
        }
        hej_news.insert_one(data)
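        # The count/delete/insert sequence above could be collapsed into one
        # atomic upsert (a sketch using pymongo's replace_one):
        #     hej_news.replace_one({'new_time': new_time}, data, upsert=True)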

    # quit() ends the session and closes all windows; a separate close() is redundant
    driver.quit()
    print('Saved Wallstreetcn macro news to MongoDB')


def get_xlcj_news():
    """爬取新浪财经突发live板块新闻"""
    client = pymongo.MongoClient('localhost', 27017)
    db = client['news']
    xlcj_news = db['xlcj_news']

    chromedriver = r"/usr/local/share/chromedriver"
    driver = webdriver.Chrome(chromedriver)
    # Reuse one browser session for all six pages instead of launching
    # a fresh Chrome per page, and let range() drive the page counter
    for num in range(1, 7):
        url = 'http://live.sina.com.cn/zt/app_zt/f/v/finance/globalnews1/?page=' + str(num)
        # Open the target URL with get()
        driver.get(url)
        # Scroll down so more entries lazy-load
        js = 'window.scrollBy(0,3000)'
        driver.execute_script(js)
        time.sleep(5)
        js = 'window.scrollBy(0,5000)'
        driver.execute_script(js)
        time.sleep(5)
        pages = driver.page_source
        soup = BeautifulSoup(pages, 'html.parser')
        soup1 = soup.find('div', class_='bd_list')
        content = soup1.find_all('div', class_='bd_i_og')
        for i in content:
            news_time = i.find('p', attrs={'class': 'bd_i_time_c'}).get_text().strip()
            news_type = i.find('p', attrs={'class': 'bd_i_tags'}).get_text().strip().replace("\n", "")
            news = i.find('p', attrs={'class': 'bd_i_txt_c'}).get_text()
            print(news_time, news_type, news)

            # Drop any stale copy keyed on the same timestamp before re-inserting
            if xlcj_news.count_documents({'news_time': news_time}) != 0:
                xlcj_news.delete_many({'news_time': news_time})
            data = {
                'news_time': news_time,
                'news_type': news_type,
                'news': news
            }
            xlcj_news.insert_one(data)
    driver.quit()
    print('Saved Sina Finance breaking news to MongoDB')
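

# Suggested refactor (a sketch, not wired in above): both scrapers repeat
# the same scroll-then-sleep dance, which could be factored into one helper.
def scroll_to_load(driver, steps=(3000, 5000), pause=5):
    """Scroll the page in stages so lazy-loaded entries have time to render."""
    for step in steps:
        driver.execute_script('window.scrollBy(0,{})'.format(step))
        time.sleep(pause)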


def main():
    # his_time = input('Enter a news time to query (format: 2017-11-2 00:00:00): ')
    # history_time = str(time.mktime(time.strptime(his_time, '%Y-%m-%d %H:%M:%S'))).replace('.0', '')
    get_hej_news()
    get_xlcj_news()


if __name__ == '__main__':
    main()