微信搜索superit|邀请体验:大数据, 数据管理、OLAP分析与可视化平台 | 赞助作者:赞助作者

抓数据

架构 aide_941 1℃ 0评论

https://googlechromelabs.github.io/chrome-for-testing/

https://storage.googleapis.com/chrome-for-testing-public/125.0.6422.141/win64/chromedriver-win64.zip

注意:ChromeDriver 的版本必须与本机安装的 Chrome 浏览器版本完全一致(可在上面的 Chrome for Testing 页面下载对应版本)。

 

login

import os
import json
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import time

# Path to the ChromeDriver executable (replace with your own path)
chrome_driver_path = 'F:/python/spider/chromedriver-win64/chromedriver.exe'

# Configure Chrome options
options = Options()
options.add_argument("--disable-blink-features=AutomationControlled")
driver = webdriver.Chrome(service=Service(executable_path=chrome_driver_path), options=options)

try:
    # Open the target page (zujuan.xkw.com) and let the user log in by hand
    driver.get("https://zujuan.xkw.com/gzsx/zsd27925")
    time.sleep(40)  # fixed 40 s window for the user to finish the manual login

    # Read the cookies of the (now logged-in) session
    cookies = driver.get_cookies()

    # Print the current working directory; cookies.json is written relative to it
    print("Current working directory: ", os.getcwd())

    # Save the cookies to a file so a later script can load them
    with open("cookies.json", "w") as file:
        json.dump(cookies, file)
    print("Cookies saved successfully.")
finally:
    # Always close the browser, even if something above failed
    driver.quit()

 

import json
from selenium import webdriver
from selenium.webdriver import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from selenium.common.exceptions import NoSuchElementException
import random
import time

# Path to the ChromeDriver executable
chrome_driver_path = 'F:/python/spider/chromedriver-win64/chromedriver.exe'

MAX_ITEMS_BEFORE_WRITING = 20  # flush the collected records to the file after every 20 items

def write_to_file(collected_items):
    """Append the given records to ``result.json``, one JSON document per line.

    Non-ASCII characters are written as-is (UTF-8) instead of being escaped.
    """
    with open('result.json', 'a', encoding='utf-8') as out:
        serialized = []
        for record in collected_items:
            serialized.append(json.dumps(record, ensure_ascii=False))
        payload = '\n'.join(serialized) + '\n'
        out.write(payload)
    print("Results saved successfully.")

# Random pause helper, used to imitate the irregular timing of manual clicks
def random_delay(time_start, time_end):
    """Sleep for a duration drawn uniformly from [time_start, time_end] seconds."""
    time.sleep(random.uniform(time_start, time_end))

# Configure Chrome options
options = Options()
# NOTE(review): presumably meant to make the automated browser harder to detect — confirm
options.add_argument("--disable-blink-features=AutomationControlled")
# Start Chrome through the ChromeDriver binary configured above
driver = webdriver.Chrome(service=Service(executable_path=chrome_driver_path), options=options)
driver.maximize_window()

def scroll_and_collect(driver, num_pages):
    """Collect question records from result pages 1..num_pages and save them.

    Page 1 is whatever the driver currently shows; every later page is reached
    by typing its number into the pager input and confirming. For each
    ``tk-quest-item`` element the question type, difficulty and the question's
    outer HTML are extracted, printed, and buffered. The buffer is flushed with
    ``write_to_file`` every ``MAX_ITEMS_BEFORE_WRITING`` records and once more
    at the end.

    Args:
        driver: Selenium WebDriver already showing the first result page.
        num_pages: Number of result pages to visit (exactly this many).

    Returns:
        The records of the last, partially filled batch (already written to
        the file); an empty list if the last flush emptied the buffer.
    """
    collected_items = []

    # range() visits exactly num_pages pages; the former
    # "while current_page <= num_pages" loop visited one page too many.
    for current_page in range(1, num_pages + 1):
        if current_page > 1:
            # Jump to the requested page through the pager's "go to" box.
            pager_item = driver.find_element(By.ID, "iptGotoNum")
            # clear() returns None, so it must not be chained with send_keys().
            pager_item.clear()
            pager_item.send_keys(str(current_page))
            driver.find_element(By.XPATH, ".//div[@class='go-to']/a[@class='confirm-btn']").click()

            random_delay(4, 8)  # wait for the new page content to load
        print(current_page)

        # All question cards on the current page
        items = driver.find_elements(By.CLASS_NAME, "tk-quest-item")
        for item in items:
            try:
                # The first two "info-cnt" cells hold question type and difficulty.
                info_cells = item.find_element(
                    By.XPATH, ".//div[@class='ques-additional']"
                ).find_elements(By.CLASS_NAME, "info-cnt")
                question_type = info_cells[0].text
                difficulty = info_cells[1].text
                content = item.find_element(By.XPATH, ".//div[@class='wrapper quesdiv']/div[@class='exam-item__cnt ']").get_attribute("outerHTML")
            except (NoSuchElementException, IndexError):
                # Skip cards that lack the expected structure (IndexError:
                # fewer than two "info-cnt" cells).
                continue

            # Answer extraction is currently disabled:
            # answer_item_box = item.find_element(By.XPATH, ".//div[@class='wrapper quesdiv']")
            # answer_item_box.click()
            # random_delay(2, 10)
            # answer_item = item.find_element(By.XPATH, ".//div[@class='wrapper quesdiv']/div[@class='exam-item__opt']/div[@class='item answer']")
            # answer_text = answer_item.text
            # if answer_text.find("升级会员")>0:
            #     answer = ""
            # else:
            #     answer = answer_item.get_attribute("outerHTML")

            record = {
                "type": question_type,
                "difficulty": difficulty,
                "content": content,
                # "answer": answer
            }
            print(record)
            collected_items.append(record)

            # Flush the buffer once it holds a full batch.
            if len(collected_items) >= MAX_ITEMS_BEFORE_WRITING:
                write_to_file(collected_items)
                collected_items = []

    # Write whatever is left after the last page.
    if collected_items:
        write_to_file(collected_items)
    return collected_items

# Close the pop-up advertisement dialog
def close_win(browser):
    """Wait 10 seconds, then click the dialog's close button if there is one.

    Best effort: any failure (typically: no such dialog) is printed and
    otherwise ignored.

    Args:
        browser: Selenium WebDriver showing the page.
    """
    time.sleep(10)
    try:
        # Selenium 4 removed find_element_by_class_name(); use the By locator.
        closewindow = browser.find_element(By.CLASS_NAME, 'next-dialog-close')
        # Click through JavaScript instead of a native WebElement click.
        browser.execute_script("arguments[0].click();", closewindow)
    except Exception as e:
        print(f"searchKey: there is no suspond Page1. e = {e}")
try:
    # Open the start page
    # NOTE(review): the page is loaded before the cookies are injected, presumably
    # because cookies can only be added for the currently loaded domain — confirm
    driver.get("https://zujuan.xkw.com/gzsx/zsd27925")
    random_delay(5, 10)  # wait for the page to load

    # Load the cookies saved by the login script
    with open("cookies.json", "r") as file:
        cookies = json.load(file)

    # Inject the cookies into the current session
    for cookie in cookies:
        driver.add_cookie(cookie)
    random_delay(2, 6)
    driver.refresh()  # reload so the page picks up the injected cookies
    random_delay(5, 10)

    # Explicit wait with a 10 s timeout
    wait = WebDriverWait(driver, 10)

    # Close the advertisement dialog (disabled)
    #close_win(driver)

    # Block until the question list section is present in the DOM
    wait.until(EC.presence_of_element_located((By.XPATH, "//section[@class='test-list']")))

    # Collect the questions from the result pages
    num_pages = 10
    scroll_and_collect(driver, num_pages)

    # Keep the browser open for 60 s so the result can be inspected
    time.sleep(60)
finally:
    # Always close the browser
    driver.quit()

 

小红书搜索“原神”测试

import json
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from selenium.common.exceptions import NoSuchElementException
import random
import time

# Path to the ChromeDriver executable
chrome_driver_path = 'F:/python/spider/chromedriver-win64/chromedriver.exe'

MAX_ITEMS_BEFORE_WRITING = 20  # flush the collected records to the file after every 20 items

def write_to_file(collected_items):
    """Append every record to ``result.json`` in JSON Lines form (UTF-8)."""
    with open('result.json', 'a', encoding='utf-8') as sink:
        encoded = map(lambda rec: json.dumps(rec, ensure_ascii=False), collected_items)
        # One record per line, followed by a trailing newline
        sink.write('\n'.join(encoded) + '\n')
    print("Results saved successfully.")

# Random pause helper, used to imitate the irregular timing of manual clicks
def random_delay(time_start, time_end):
    """Pause for a uniformly random number of seconds between the two bounds."""
    pause_seconds = random.uniform(time_start, time_end)
    time.sleep(pause_seconds)
    return None

# Configure Chrome options
options = Options()
# NOTE(review): presumably meant to make the automated browser harder to detect — confirm
options.add_argument("--disable-blink-features=AutomationControlled")
# Start Chrome through the ChromeDriver binary configured above
driver = webdriver.Chrome(service=Service(executable_path=chrome_driver_path), options=options)
driver.maximize_window()

def scroll_and_collect(driver, num_items):
    """Scroll the result feed and collect up to ``num_items`` distinct notes.

    For every ``note-item`` card the cover URL, title, author avatar URL and
    author name are extracted. Records are buffered and flushed with
    ``write_to_file`` every ``MAX_ITEMS_BEFORE_WRITING`` items and once more
    at the end. After each pass the page is scrolled to the bottom to trigger
    loading of more cards.

    Collection stops when ``num_items`` records were gathered, when several
    consecutive scrolls bring no new record, or when no card shows up within
    the wait timeout.

    Args:
        driver: Selenium WebDriver already showing the search results.
        num_items: Maximum number of distinct records to collect.

    Returns:
        The records of the last, partially filled batch (already written to
        the file); an empty list if the last flush emptied the buffer.
    """
    # Imported locally: the file-level imports do not include this name.
    from selenium.common.exceptions import TimeoutException

    max_idle_rounds = 3  # give up after this many scrolls without a new record

    collected_items = []
    result_count = 0
    seen = set()  # keys of records already collected in earlier passes
    idle_rounds = 0

    while result_count < num_items:
        new_in_round = 0
        # find_elements returns every card currently in the DOM, including
        # the ones handled in previous passes; `seen` filters those out.
        items = driver.find_elements(By.XPATH, "//section[@class='note-item']")
        for item in items:
            if result_count >= num_items:
                break
            try:
                cover = item.find_element(By.XPATH, ".//div/a[@class='cover mask ld']/img").get_attribute("src")
                title = item.find_element(By.XPATH, ".//div[@class='footer']/a[@class='title']/span").text
                author_avatar = item.find_element(By.XPATH, ".//div[@class='card-bottom-wrapper']/a[@class='author']/img").get_attribute("src")
                author_name = item.find_element(By.XPATH, ".//div[@class='card-bottom-wrapper']/a[@class='author']/div/div[@class='name']/span[@class='name']").text
            except NoSuchElementException:
                # Card without the expected structure: skip it.
                continue

            key = (cover, title, author_avatar, author_name)
            if key in seen:
                continue
            seen.add(key)

            collected_items.append({
                "cover": cover,
                "title": title,
                "author_avatar": author_avatar,
                "author_name": author_name
            })
            result_count += 1
            new_in_round += 1

            # Flush the buffer once it holds a full batch.
            if len(collected_items) >= MAX_ITEMS_BEFORE_WRITING:
                write_to_file(collected_items)
                collected_items = []

        if result_count >= num_items:
            break

        # Without this guard the loop would never end once the feed stops
        # delivering new (parsable) cards.
        if new_in_round == 0:
            idle_rounds += 1
            if idle_rounds >= max_idle_rounds:
                break
        else:
            idle_rounds = 0

        # Scroll to the bottom to trigger loading of more cards.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        random_delay(4, 8)  # wait for the new content to load

        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, "//section[@class='note-item']"))
            )
        except TimeoutException:
            # WebDriverWait.until() signals "nothing appeared" with
            # TimeoutException, not NoSuchElementException.
            break

    # Write whatever is left in the buffer.
    if collected_items:
        write_to_file(collected_items)
    return collected_items

try:
    # Open the Xiaohongshu home page
    driver.get("https://www.xiaohongshu.com")
    random_delay(5, 10)  # wait for the page to load

    # Load the cookies from file
    # NOTE(review): the login script in this article saves cookies.json after a login on
    # zujuan.xkw.com — verify that this file holds xiaohongshu.com cookies
    with open("cookies.json", "r") as file:
        cookies = json.load(file)

    # Inject the cookies into the current session
    for cookie in cookies:
        driver.add_cookie(cookie)
    random_delay(2, 6)
    driver.refresh()  # reload so the page picks up the injected cookies
    random_delay(5, 10)

    # Explicit wait (10 s timeout) until the search box is present
    wait = WebDriverWait(driver, 10)
    search_box = wait.until(EC.presence_of_element_located((By.XPATH, "//input[@placeholder='搜索小红书']")))
    # Type the search keyword
    search_box.send_keys("原神")
    random_delay(2, 5)
    # Click the search button once it is clickable
    search_button = wait.until(EC.element_to_be_clickable((By.XPATH, "//div[@class='input-button']")))
    search_button.click()
    random_delay(2, 5)
    # Block until at least one result card is present
    wait.until(EC.presence_of_element_located((By.XPATH, "//section[@class='note-item']")))

    # Collect cover, title, author avatar and author name of the first num_items results
    num_items = 10
    scroll_and_collect(driver, num_items)

    # Keep the browser open for 60 s so the search result can be inspected
    time.sleep(60)
finally:
    # Always close the browser
    driver.quit()

 

 

 

C# 版本,需要安装的 NuGet 包:Selenium.WebDriver.ChromeDriver、Selenium.WebDriver、Selenium.Support、Selenium.Chrome.WebDriver

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Runtime.InteropServices;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using System.Windows.Forms;
using OpenQA.Selenium.Chrome;

namespace WindowsFormsBroswer
{
    /// <summary>
    /// Form that drives Chrome through Selenium: it opens the URLs read from the
    /// "图片url" sub-folders on 1688.com and hands the page sources to <c>Write</c>.
    /// References:
    /// https://www.cnblogs.com/zhaotianff/p/11330810.html Selenium usage
    /// https://www.nowcoder.com/discuss/746291688196296704
    /// https://zhuanlan.zhihu.com/p/27397132773 Selenium screenshots
    /// https://blog.csdn.net/c_xiazai12345/article/details/120654809 window switching
    /// https://www.yisu.com/jc/556716.html replacing the WebBrowser control with CefSharp
    /// https://www.cnblogs.com/mq0036/p/11059644.html
    /// </summary>
    public partial class Form1 : Form
    {
        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Reads one URL per sub-folder of "图片url" (from its data.txt), opens each URL in
        /// Chrome, passes the list page source to <c>Write</c>, then visits the detail URL of
        /// every offer in the returned model and passes those page sources to <c>Write</c> too.
        /// Returns early if the site still redirects to the login page after a login attempt.
        /// Any exception shuts the driver down and is rethrown.
        /// </summary>
        private void crawlingWebFunc()
        {
            SetText("\r\n开始尝试...");
            List<testfold> surls = new List<testfold>();
            string path = System.Environment.CurrentDirectory + "\\图片url\\";
            DirectoryInfo root = new DirectoryInfo(path);
            DirectoryInfo[] dics = root.GetDirectories();
            foreach (var itemdic in dics)
            {
                // Join all lines of data.txt without separators; "using" closes the
                // reader even if reading throws.
                StringBuilder txt = new StringBuilder();
                using (StreamReader sr = new StreamReader(itemdic.FullName + "\\data.txt"))
                {
                    while (!sr.EndOfStream)
                    {
                        txt.Append(sr.ReadLine());// + "\n";
                    }
                }
                surls.Add(new testfold() { key = itemdic.FullName, picurl = txt.ToString() });
            }

            // chromedriver.exe is looked up in the current directory
            ChromeDriverService service = ChromeDriverService.CreateDefaultService(System.Environment.CurrentDirectory);
            //  service.HideCommandPromptWindow = true;

            ChromeOptions options = new ChromeOptions();
            options.AddArguments("--test-type", "--ignore-certificate-errors");
            options.AddArgument("enable-automation");
            //   options.AddArgument("headless");
            //  options.AddArguments("--proxy-server=http://user:password@yourProxyServer.com:8080");

            // IWebDriver is fully qualified: this file imports only OpenQA.Selenium.Chrome.
            using (OpenQA.Selenium.IWebDriver driver = new OpenQA.Selenium.Chrome.ChromeDriver(service, options, TimeSpan.FromSeconds(120)))
            {
                driver.Url = "https://www.1688.com/";
                Thread.Sleep(200);
                try
                {
                    int a = 1; // 1-based index of the current folder, used in progress messages
                    foreach (var itemsurls in surls)
                    {
                        SetText("\r\n第" + a.ToString() + "个");
                        driver.Navigate().GoToUrl(itemsurls.picurl);
                        // Login required?
                        if (driver.Url.Contains("login.1688.com"))
                        {
                            SetText("\r\n需要登录,开始尝试...");
                            trylogin(driver); // login attempt finished
                            // Try again.
                            // NOTE(review): this retries a hard-coded URL rather than
                            // itemsurls.picurl — confirm that this is intended.
                            driver.Navigate().GoToUrl("https://s.1688.com/youyuan/index.htm?tab=imageSearch&imageType=oss&imageAddress=cbuimgsearch/eWXC7XHHPN1607529600000&spm=");

                            if (driver.Url.Contains("login.1688.com"))
                            {
                                // Still on the login page: give up
                                SetText("\r\n退出,换ip重试...");
                                return;
                            }
                        }

                        // The page shows only one hover container at a time, so the hover
                        // content cannot all be revealed and then downloaded; it has to be
                        // fetched some other way.
                        //  var elements = document.getElementsByClassName('hover-container');
                        //  Array.prototype.forEach.call(elements, function(element) {
                        //  element.style.display = "block";
                        //   console.log(element);
                        //  });

                        //   IJavaScriptExecutor js = (IJavaScriptExecutor)driver;

                        //    var sss = js.ExecuteScript(" var elements = document.getElementsByClassName('hover-container');  Array.prototype.forEach.call(elements, function(element) {  console.log(element); element.setAttribute(\"class\", \"测试title\");  element.style.display = \"block\";  console.log(element); });");

                        Thread.Sleep(500);
                        var responseModel = Write(itemsurls.key, driver.PageSource, Pagetypeenum.列表);
                        Thread.Sleep(500);
                        int i = 1; // 1-based index of the current offer within this folder
                        foreach (var offer in responseModel?.data?.offerList ?? new List<OfferItemModel>())
                        {
                            driver.Navigate().GoToUrl(offer.information.detailUrl);
                            Write(itemsurls.key, driver.PageSource, Pagetypeenum.详情);
                            SetText("\r\n第" + a.ToString() + "-" + i.ToString() + "个");
                            Thread.Sleep(500);
                            i++;
                        }

                        // Advance the folder counter; it used to stay at 1 forever.
                        a++;
                    }
                }
                catch (Exception)
                {
                    CloseChromeDriver(driver);
                    throw;
                }
            }
        }

        #region 异常  退出chromedriver

        [DllImport("user32.dll", EntryPoint = "FindWindow")]
        private extern static IntPtr FindWindow(string lpClassName, string lpWindowName);

        [DllImport("user32.dll", EntryPoint = "SendMessage")]
        public static extern int SendMessage(IntPtr hWnd, int Msg, int wParam, int lParam);

        public const int SW_HIDE = 0;
        public const int SW_SHOW = 5;

        [DllImport("user32.dll", EntryPoint = "ShowWindow")]
        public static extern int ShowWindow(IntPtr hwnd, int nCmdShow);

        /// <summary>
        /// Looks up the window whose title is the full path of chromedriver.exe in the
        /// current directory.
        /// </summary>
        /// <returns>The window handle as returned by FindWindow.</returns>
        public IntPtr GetWindowHandle()
        {
            string name = (Environment.CurrentDirectory + "\\chromedriver.exe");
            IntPtr hwd = FindWindow(null, name);
            return hwd;
        }

        /// <summary>
        /// Closes the chromedriver window, if one is found. Errors are ignored.
        /// </summary>
        public void CloseWindow()
        {
            try
            {
                IntPtr hwd = GetWindowHandle();
                // Only send the message when a window was actually found.
                if (hwd != IntPtr.Zero)
                {
                    SendMessage(hwd, 0x10, 0, 0); // 0x10 = WM_CLOSE
                }
            }
            catch { }
        }

        /// <summary>
        /// Shuts chromedriver down: quits and disposes the driver (errors ignored), then
        /// closes the chromedriver window.
        /// </summary>
        /// <param name="driver">The driver to shut down.</param>
        public void CloseChromeDriver(OpenQA.Selenium.IWebDriver driver)
        {
            try
            {
                driver.Quit();
                driver.Dispose();
            }
            catch { }
            CloseWindow();
        }

        #endregion 异常  退出chromedriver

    }
}

 

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;

namespace WindowsFormsBroswer
{
    /// <summary>
    /// Form that loads a page in a WebBrowser control and shows a message box once the
    /// document has finished loading.
    /// </summary>
    public partial class Form2 : Form
    {
        public Form2()
        {
            InitializeComponent();

            WebBrowser webBrowser = new WebBrowser();
            // Subscribe before starting the navigation so the handler is already in
            // place when the document completes.
            webBrowser.DocumentCompleted += WebBrowser_DocumentCompleted;
            webBrowser.Navigate("https://www.baidu.com/");

            //Application.Run( );
            Console.WriteLine("结束");

        }

        /// <summary>
        /// Shows a message box, then unsubscribes itself from the control that raised
        /// the event.
        /// </summary>
        /// <param name="sender">The WebBrowser that raised the event.</param>
        /// <param name="e">Event data (unused).</param>
        private static void WebBrowser_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
        {
            // The constructor's local variable is not visible in this static handler;
            // the control is taken from the event sender instead.
            WebBrowser webBrowser = sender as WebBrowser;
            try
            {
                MessageBox.Show("加载成功");
            }
            finally
            {
                if (webBrowser != null)
                {
                    webBrowser.DocumentCompleted -= WebBrowser_DocumentCompleted;
                }
            }
        }

    }
}

 

 

转载请注明:SuperIT » 抓数据

喜欢 (0)or分享 (0)

您必须 登录 才能发表评论!