A follow-up to my last attempt (scraping Twitter data), after getting stuck on code I found on GitHub that I couldn't understand:

The script covers Selenium login, asynchronously loaded pages, and XPath extraction. Because Twitter search only surfaces roughly the last week of results, the data collected here is still incomplete, and the code needs further adjustment.
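The script below does not include the login step itself. One way to avoid automating Twitter's login form is to start Chrome from a profile that is already signed in. This is only a sketch: the profile path is a placeholder that you would replace with your own Chrome user-data directory.

# Sketch: reuse an already-logged-in Chrome profile instead of scripting the login form.
from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument(r'--user-data-dir=C:\Users\18742\AppData\Local\Google\Chrome\User Data')  # placeholder path
options.add_argument('--profile-directory=Default')  # the profile that is already signed in to Twitter
wb = webdriver.Chrome(options=options)
wb.get('https://twitter.com/home')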

from selenium import webdriver
import time
from lxml import etree

# Collect the text of every post on the page
def get_posts(url):
    """
    url: the search/browse page that lists all the posts
    """
    wb = webdriver.Chrome()
    wb.get(url)
    time.sleep(3)
    
    # handle the asynchronously loaded page: scroll to the bottom repeatedly
    js = 'return document.body.scrollHeight'  # JS snippet that returns the current page height
    height = wb.execute_script(js)
    wb.execute_script('window.scrollTo(0, document.body.scrollHeight)')
    time.sleep(5)

    t1 = int(time.time())
    status = True
    num = 0
    
    post_list = []

    while status:
        t2 = int(time.time())
        if t2 - t1 < 30:  # parse the page source while scrolling: Twitter loads results asynchronously, so a single snapshot taken after scrolling is incomplete; the posts collected this way need deduplication afterwards
            selector = etree.HTML(wb.page_source)  # parse the page source into an lxml element tree
            infos =  selector.xpath("//*/div[@class='css-1dbjc4n r-18u37iz']/div[2]/div[2]/div[1]")
            for info in infos:
                post = info.xpath("string(.)").strip()
                post_list.append(post)
            new_height = wb.execute_script(js)
            if new_height > height:
                time.sleep(1)
                wb.execute_script(
                    'window.scrollTo(0, document.body.scrollHeight)')
                height = new_height
                t1 = int(time.time())
        elif num < 3:
            time.sleep(3)
            num = num + 1
        else:  # give up after the timeout and retries: we have reached the bottom of the page
            status = False
    return post_list

url = 'https://twitter.com/search?q=nuclear%20waste%20water&src=typed_query'
post_list = get_posts(url)
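Because the page is parsed repeatedly while scrolling, post_list contains duplicates (the comment inside get_posts already notes this). A minimal, order-preserving deduplication of the result:

unique_posts = list(dict.fromkeys(post_list))  # keep the first occurrence of each post, preserving scroll order
print(len(post_list), len(unique_posts))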
# -*- coding: utf-8 -*-
"""
Created on Mon Dec 13 22:24:58 2021

@author: 18742
"""
from selenium import webdriver
import time
import pandas as pd
from lxml import etree
from lxml import html
# Grab the whole block directly, then use string(.) to join its text


## Extract the data from the page
#def get_info2(wb):
#    wb.implicitly_wait(10)
##    post =  wb.find_element_by_xpath("//*/div[@class='css-1dbjc4n']/div/span").text
#    post =  wb.find_element_by_xpath("//*/div[@class='css-1dbjc4n r-18u37iz']/div[2]/div[2]/div[1]")
#    
#    data =  selector.xpath("//*/div[@class='css-1dbjc4n r-18u37iz']/div[2]/div[2]/div[1]")
#    post_list = []
#    for i in range(len(data)):
#        post = data[i].xpath('string(.)').strip()  # join the text nodes and strip surrounding whitespace
#        print(post)
#        post = str(post)
#        post_list.append("".join(post))
#
#    like = wb.find_element_by_xpath("//*/div[@data-testid='like']//div/span/span").text
#    retweet = wb.find_element_by_xpath("//*/div[@data-testid='retweet']//div/span/span").text
#    reply = wb.find_element_by_xpath("//*/div[@data-testid='reply']//div/span/span").text
#    data = {
##                "good":good,
#        "post":post,
#        "like":like,
#        "retweet":retweet,
#        "reply":reply}
#    return data
#
#
#
## Collect the text of every post on the page
#def get_posts(url):
#    """
#    url: the search/browse page that lists all the posts
#    """
#    wb = webdriver.Chrome()
#    wb.get(url)
#    time.sleep(3)
#    
#
#    js = 'return action=document.body.scrollHeight'
#    height = wb.execute_script(js)
#    wb.execute_script('window.scrollTo(0, document.body.scrollHeight)')
#    time.sleep(5)
#
#    t1 = int(time.time())
#    status = True
#    num = 0
#
#    while status:
#        t2 = int(time.time())
#        if t2 - t1 < 30:
#            new_height = wb.execute_script(js)
#            if new_height > height:
#                time.sleep(1)
#                wb.execute_script(
#                    'window.scrollTo(0, document.body.scrollHeight)')
#                height = new_height
#                t1 = int(time.time())
#        elif num < 3:
#            time.sleep(3)
#            num = num + 1
#        else:  # give up after the timeout and retries: we have reached the bottom of the page
#            status = False
#
#    data = get_info2(wb)
#    
#    return data





# == The part that runs successfully ======================================================
# # fetch the posts directly with chromedriver
# ## the search URL
# url = 'https://twitter.com/search?q=nuclear%20waste%20water&src=typed_query'
# post_list = get_posts(url)
# 
# 
# wb = webdriver.Chrome()
# wb.get(url)
# time.sleep(3)
# selector = html.etree.HTML(wb.page_source)  # parse the page source into an lxml element tree
# like = selector.xpath("//div[@data-testid='like']//div/span/span/text()")
# print(like)
# 
# data =  selector.xpath("//*/div[@class='css-1dbjc4n r-18u37iz']/div[2]/div[2]/div[1]")
# post_list = []
# for i in range(len(data)):
#     post = data[i].xpath('string(.)').strip()  # join the text nodes and strip surrounding whitespace
#     print(post)
#     post = str(post)
#     post_list.append("".join(post))
# 
# 
# =============================================================================


# Collect the text of every post on the page
def get_posts(url):
    """
    url: the search/browse page that lists all the posts
    """
    wb = webdriver.Chrome()
    wb.get(url)
    time.sleep(3)
    
    # handle the asynchronously loaded page: scroll to the bottom repeatedly
    js = 'return document.body.scrollHeight'  # JS snippet that returns the current page height
    height = wb.execute_script(js)
    wb.execute_script('window.scrollTo(0, document.body.scrollHeight)')
    time.sleep(5)

    t1 = int(time.time())
    status = True
    num = 0
    
    post_list = []

    while status:
        t2 = int(time.time())
        if t2 - t1 < 30:  # parse the page source while scrolling; deduplicate the collected posts afterwards
            selector = etree.HTML(wb.page_source)  # parse the page source into an lxml element tree
            infos = selector.xpath("//*/div[@class='css-1dbjc4n r-18u37iz']/div[2]/div[2]/div[1]")  # alternative: //*/div[@class='css-1dbjc4n r-18u37iz']/div[2]/div[2]/div
            for info in infos:
                post = info.xpath("string(.)").strip()
                post_list.append(post)
            new_height = wb.execute_script(js)
            if new_height > height:
                time.sleep(1)
                wb.execute_script(
                    'window.scrollTo(0, document.body.scrollHeight)')
                height = new_height
                t1 = int(time.time())
        elif num < 3:
            time.sleep(3)
            num = num + 1
        else:  # give up after the timeout and retries: we have reached the bottom of the page
            status = False
    return post_list

## Parse the page html once at the end -- this probably fails because the full source of the asynchronously loaded content is never available in a single snapshot
#    selector = html.etree.HTML(wb.page_source)  # parse the page source into an lxml element tree
#    infos =  selector.xpath("//*/div[@class='css-1dbjc4n r-18u37iz']/div[2]/div[2]/div[1]")
#    
#    post_list = []
#    for info in infos:
#        post = info.xpath("string(.)").strip()
#        post_list.append(post)
#    return post_list
#
#    # scrape via WebElement objects instead
#    infos =  wb.find_element_by_xpath("//*/div[@class='css-1dbjc4n r-18u37iz']/div[2]/div[2]/div[1]")
#    post_list = []
#    for info in infos:
#        post = info.text.strip()
#        post_list.append(post)
#    return post_list    
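The fixed time.sleep calls above are fragile when the page loads faster or slower than expected. A sketch of the initial wait using Selenium's explicit waits (WebDriverWait and expected_conditions from selenium.webdriver.support); the XPath is the post selector used above, and whether it still matches the live page is an assumption.

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

url = 'https://twitter.com/search?q=nuclear%20waste%20water&src=typed_query'
wb = webdriver.Chrome()
wb.get(url)
# wait up to 30 seconds until at least one post block is present, instead of sleeping a fixed time
WebDriverWait(wb, 30).until(EC.presence_of_element_located(
    (By.XPATH, "//div[@class='css-1dbjc4n r-18u37iz']/div[2]/div[2]/div[1]")))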


url = 'https://twitter.com/search?q=Beijing%20Winter%20Olympics%20Opening%20Ceremony&src=typed_query'
post_list = get_posts(url)
comm_df = pd.DataFrame(post_list)
print('here')
comm_df.to_csv(r'C:\Users\18742\Desktop\毕业论文\代码\post_Olympic_ceremony2.csv', encoding='utf_8_sig', index=False)



##
## Only the tweet text, for each search topic
#url = 'https://twitter.com/search?q=nuclear%20waste%20water&src=typed_query'
#post_list = get_posts(url)
#comm_df = pd.DataFrame(post_list)
#comm_df.to_csv(r'C:\Users\18742\Desktop\毕业论文\代码\post_twitter_nuclear.csv', encoding='utf_8_sig', index=False)
#
#url = 'https://twitter.com/search?q=shenzhou-13&src=typed_query'
#post_list = get_posts(url)
#comm_df = pd.DataFrame(post_list)
#comm_df.to_csv(r'C:\Users\18742\Desktop\毕业论文\代码\post_twitter_shenzhou.csv', encoding='utf_8_sig', index=False)
#
#url = 'https://twitter.com/search?q=China%20lunar%20soil&src=typed_query'
#post_list = get_posts(url)
#comm_df = pd.DataFrame(post_list)
#comm_df.to_csv(r'C:\Users\18742\Desktop\毕业论文\代码\post_twitter_chinalunar.csv', encoding='utf_8_sig', index=False)
#
#url = 'https://twitter.com/search?q=Abdulrazak%20Gurnah%20Nobel%20Prize%20in%20Literature&src=typed_query'
#post_list = get_posts(url)
#comm_df = pd.DataFrame(post_list)
#comm_df.to_csv(r'C:\Users\18742\Desktop\毕业论文\代码\post_twitter_nobel.csv', encoding='utf_8_sig', index=False)
#
#url = 'https://twitter.com/search?q=Vietnam%20Factories%20&src=typed_query'
#post_list = get_posts(url)
#comm_df = pd.DataFrame(post_list)
#comm_df.to_csv(r'C:\Users\18742\Desktop\毕业论文\代码\post_twitter_vietnam.csv', encoding='utf_8_sig', index=False)
#
#url = 'https://twitter.com/search?q=China%20provide%20vaccines&src=typed_query'
#post_list = get_posts(url)
#comm_df = pd.DataFrame(post_list)
#comm_df.to_csv(r'C:\Users\18742\Desktop\毕业论文\代码\post_twitter_chinavaccine.csv', encoding='utf_8_sig', index=False)
#
#url = 'https://twitter.com/search?q=Impact%20of%20Brexit%20on%20economy%20%27worse%20than%20Covid%27&src=typed_query'
#post_list = get_posts(url)
#comm_df = pd.DataFrame(post_list)
#comm_df.to_csv(r'C:\Users\18742\Desktop\毕业论文\代码\post_twitter_brexiteconomy.csv', encoding='utf_8_sig', index=False)
#
#url = 'https://twitter.com/search?q=rich%20countries%20hogging%20vaccines&src=typed_query'
#post_list = get_posts(url)
#comm_df = pd.DataFrame(post_list)
#comm_df.to_csv(r'C:\Users\18742\Desktop\毕业论文\代码\post_twitter_richhogging.csv', encoding='utf_8_sig', index=False)
#
#url = 'https://twitter.com/search?q=ease%20travel%20restrictions&src=typed_query'
#post_list = get_posts(url)
#comm_df = pd.DataFrame(post_list)
#comm_df.to_csv(r'C:\Users\18742\Desktop\毕业论文\代码\post_twitter_easetravelres.csv', encoding='utf_8_sig', index=False)
#
#url = 'https://twitter.com/search?q=US%20reaches%20agreement%20to%20end%20European%20digital%20services%20taxes&src=typed_query'
#post_list = get_posts(url)
#comm_df = pd.DataFrame(post_list)
#comm_df.to_csv(r'C:\Users\18742\Desktop\毕业论文\代码\post_twitter_agreeontaxr.csv', encoding='utf_8_sig', index=False)
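The commented-out block above repeats the same three lines for every topic. A sketch that collapses it into a loop over a query-to-filename mapping (only a few of the topics above are listed; the rest follow the same pattern):

import os
from urllib.parse import quote

out_dir = r'C:\Users\18742\Desktop\毕业论文\代码'
topics = {
    'nuclear waste water': 'post_twitter_nuclear.csv',
    'shenzhou-13': 'post_twitter_shenzhou.csv',
    'China lunar soil': 'post_twitter_chinalunar.csv',
    # ... the remaining topics from the block above follow the same pattern
}
for query, filename in topics.items():
    url = 'https://twitter.com/search?q=' + quote(query) + '&src=typed_query'
    posts = get_posts(url)
    pd.DataFrame(posts).to_csv(os.path.join(out_dir, filename), encoding='utf_8_sig', index=False)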



#like = info.xpath("//div[@data-testid='like']//div/span/span/text()")
#retweet = info.xpath("//div[@data-testid='retweet']//div/span/span/text()")
#reply = info.xpath("//div[@data-testid='reply']//div/span/span/text()")
#data = {
#    "post":post,
#    "like":like,
#    "retweet":retweet,
#    "reply":reply}
#
#
#
#
#
#
## Extract the post text and metrics from the page
#def get_info(wb,url,list,m):
#    driver.implicitly_wait(10)
#    m.append(driver.page_source)
#    selector = html.etree.HTML(driver.page_source)  # parse the page source into an lxml element tree
#    # grab the source first, then gradually work out a usable XPath or regular expression
#    m.append(selector)
#
#    infos = selector.xpath("//div[@class='css-1dbjc4n']")  # the page contains many blocks with this structure; collect them all in a list and pull the needed fields from each
#    m.append(infos)
##    print(infos)
#    for info in infos:
#        ###需要的信息
# #       data = info.xpath("//a[@class='J_ClickStat']/@href")  ## find the product name; search within a broader scope
##        good = data.xpath("string(.)").strip()
#        post = []
#        data= info.xpath("//*/div[@class='css-1dbjc4n']/div/span/text()")
#        for i in range(len(data)):
#            post.append(data[i].xpath('string(.)'))
##        post= data.xpath("string(.)").strip()
##        for span in post:
##            # format the current node
##            post =span.xpath('string(.)')
#        like = info.xpath("//*/div[@data-testid='like']//span/span/text()")
#        
#        retweet = info.xpath("//*/div[@data-testid='retweet']//span/span/text()")
#        reply = info.xpath("//*/div[@data-testid='reply']//span/span/text()")
#        data = {
##                "good":good,
#                "post":post,
#                "like":like,
#                "retweet":retweet,
#                "reply":reply
#        }
#        print(data)
#        list.append(data)
#    return list
#
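The commented attempts above read the like/retweet/reply counts with XPaths that start with //, which matches nodes anywhere in the document rather than inside the current tweet, so every post ends up paired with the same counts. A sketch that keeps each post together with its own metrics by using relative .// paths inside the loop; the class name and data-testid selectors are the ones used above, and whether they still match the live page is an assumption.

def get_posts_with_metrics(page_source):
    """Parse one snapshot of the page and pair each post's text with its like/retweet/reply counts."""
    selector = etree.HTML(page_source)
    rows = []
    for card in selector.xpath("//div[@class='css-1dbjc4n r-18u37iz']"):  # one card per tweet
        post = card.xpath("string(./div[2]/div[2]/div[1])").strip()
        like = card.xpath(".//div[@data-testid='like']//span/span/text()")
        retweet = card.xpath(".//div[@data-testid='retweet']//span/span/text()")
        reply = card.xpath(".//div[@data-testid='reply']//span/span/text()")
        rows.append({
            'post': post,
            'like': like[0] if like else '0',
            'retweet': retweet[0] if retweet else '0',
            'reply': reply[0] if reply else '0',
        })
    return rows

# usage: rows = get_posts_with_metrics(wb.page_source); pd.DataFrame(rows)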