使用 Python/Selenium,我定义了一个休眠装饰器,在 Web 抓取项目的 5 个函数调用(对服务器的请求)之间等待 20 秒,这样我就不会压垮他们的服务器。查看终端输出,它似乎按照我的预期工作,但是当我在创建输出文件“Hitachi.csv”时观察它时,它似乎没有在第五个网址处暂停,而是在最后,引导我相信昏昏欲睡的装饰器在第五次调用时不会暂停。请帮忙:)
def sleepy(f):
def wrapped(*args, **kwargs):
wrapped.calls += 1
print(f"{f.__name__} called {wrapped.calls} times")
if wrapped.calls % 5 == 0:
print("Sleeping...")
sleep(20)
return f(*args, **kwargs)
wrapped.calls = 0
return wrapped
# script_concurrent.py
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from concurrent.futures import ThreadPoolExecutor, wait
from time import sleep, time
from selenium import webdriver
import datetime
import os
from scrapers.scraper import connect_to_base, parse_html, write_to_file
def counted(f):
def wrapped(*args, **kwargs):
wrapped.calls += 1
return f(*args, **kwargs)
wrapped.calls = 0
return wrapped
def sleepy(f):
def wrapped(*args, **kwargs):
wrapped.calls += 1
print(f"{f.__name__} called {wrapped.calls} times")
if wrapped.calls % 5 == 0:
print("Sleeping...")
sleep(20)
return f(*args, **kwargs)
wrapped.calls = 0
return wrapped
@counted
@sleepy
def run_process(filename="Hitachi.csv"):
# init browser
os.environ["WDM_LOG_LEVEL"] = "0"
browser = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
if connect_to_base(browser):
sleep(2)
html = browser.page_source
output_list = parse_html(html)
write_to_file(output_list, filename)
else:
print("Error connecting to AVS")
# exit
browser.quit()
if __name__ == "__main__":
start_time = time()
output_timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
output_filename = f"output_{output_timestamp}.csv"
futures = []
with ThreadPoolExecutor() as executor:
futures.extend(executor.submit(run_process) for _ in range(2, 12))
wait(futures)
end_time = time()
elapsed_time = end_time - start_time
print(f"Elapsed run time: {elapsed_time / 60:.2f} minutes.")
print(f"Calls to run_process: {run_process.calls}")
# scraper.py
import requests
import csv
from pathlib import Path
import itertools
import pandas as pd
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
BASE_DIR = Path(__file__).resolve(strict=True).parent.parent
def csv_to_iter(filename, idx=0):
pd.set_option("display.max_rows", None)
df = pd.read_csv(filename)
df = df.iloc[:, [idx]]
df = df.values.tolist()
df = list(itertools.chain(*df))
df = sorted(list(set(df)))
return iter(df)
my_iter = csv_to_iter(
filename="/Users/myusername/Downloads/Code/AVS-concurrent-web-scraping/Sorted_MAH_Hitachi_urls.csv"
)
def connect_to_base(browser):
my_next_iter = next(my_iter)
connection_attempts = 0
while connection_attempts < 3:
try:
browser.get(my_next_iter)
# wait for table element with id = 'content' to load
# before returning True
WebDriverWait(browser, 5).until(
EC.presence_of_element_located((By.CSS_SELECTOR, ".container"))
)
return True
except Exception as e:
print(e)
connection_attempts += 1
print(f"Error connecting to {my_next_iter}.")
print(f"Attempt #{connection_attempts}.")
return False
def parse_html(html):
# create soup object
soup = BeautifulSoup(html, "html.parser")
# parse soup object to get wikipedia article url, title, and last modified date
# part_position = [
# item.text.strip() for item in soup.findAll("td", {"data-title": "Pos."})
# ]
part_number_1 = [
item.text.strip() for item in soup.findAll("td", {"data-title": "Part â„–"})
]
part_number_2 = [
item.text.strip() for item in soup.findAll("td", {"data-title": "Part №"})
]
if not part_number_1:
pass
else:
part_number = part_number_1
if not part_number_2:
pass
else:
part_number = part_number_2
part_qty = [item.text.strip() for item in soup.findAll("td", {"data-title": "Qty"})]
part_name = [
item.text.strip() for item in soup.findAll("td", {"data-title": "Part name"})
]
part_comments = [
item.text.strip() for item in soup.findAll("td", {"data-title": "Comments"})
]
machine = [
item.text.split()[0] for item in soup.findAll("article", {"id": "node-content"})
]
alternative_machines = [
item.text.split()[2] for item in soup.findAll("article", {"id": "node-content"})
]
title = [item.text for item in soup.findAll("span", {"class": "trans"})]
parts_group = [item.h3 for item in soup.findAll("div", {"class": "card-header"})]
article_info = {
# "Pos.": part_position,
"Part No": part_number,
"Qty": part_qty,
"Parts name": part_name,
"Comments": part_comments,
"Machine": machine,
"Alternative_machines": alternative_machines,
"Title": title,
"Parts_group": parts_group,
}
return [article_info]
def get_load_time(article_url):
try:
# set headers
headers = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36"
}
# make get request to article_url
response = requests.get(
article_url, headers=headers, stream=True, timeout=3.000
)
# get page load time
load_time = response.elapsed.total_seconds()
except Exception as e:
print(e)
load_time = "Loading Error"
return load_time
def write_to_file(output_list, filename="Hitachi.csv"):
for row in output_list:
with open(Path(BASE_DIR).joinpath(filename), "a") as csvfile:
fieldnames = [
"Pos.",
"Part No",
"Qty",
"Parts name",
"Comments",
"Machine",
"Alternative_machines",
"Title",
"Parts_group",
]
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writerow(row)
Output
run_process called 1 times
[WDM] - ====== WebDriver manager ======
2022-07-10 14:45:59,433 INFO ====== WebDriver manager ======
run_process called 2 times
[WDM] - ====== WebDriver manager ======
2022-07-10 14:45:59,439 INFO ====== WebDriver manager ======
run_process called 3 times
[WDM] - ====== WebDriver manager ======
2022-07-10 14:45:59,440 INFO ====== WebDriver manager ======
run_process called 4 times
[WDM] - ====== WebDriver manager ======
2022-07-10 14:45:59,450 INFO ====== WebDriver manager ======
run_process called 5 times
Sleeping...
run_process called 6 times
[WDM] - ====== WebDriver manager ======
2022-07-10 14:45:59,461 INFO ====== WebDriver manager ======
run_process called 7 times
[WDM] - ====== WebDriver manager ======
2022-07-10 14:45:59,467 INFO ====== WebDriver manager ======
run_process called 8 times
[WDM] - ====== WebDriver manager ======
2022-07-10 14:45:59,477 INFO ====== WebDriver manager ======
[WDM] - Current google-chrome version is 103.0.5060
2022-07-10 14:45:59,690 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-10 14:45:59,690 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-10 14:45:59,720 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-10 14:45:59,720 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-10 14:45:59,733 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-10 14:45:59,733 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-10 14:45:59,789 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-10 14:45:59,790 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-10 14:45:59,793 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-10 14:45:59,793 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-10 14:45:59,798 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-10 14:45:59,798 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-10 14:45:59,807 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-10 14:45:59,807 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Driver [/Users/myusername/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-10 14:45:59,868 INFO Driver [/Users/myusername/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
[WDM] - Driver [/Users/myusername/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-10 14:45:59,909 INFO Driver [/Users/myusername/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
[WDM] - Driver [/Users/myusername/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-10 14:45:59,946 INFO Driver [/Users/myusername/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
[WDM] - Driver [/Users/myusername/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-10 14:45:59,974 INFO Driver [/Users/myusername/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
[WDM] - Driver [/Users/myusername/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-10 14:46:00,007 INFO Driver [/Users/myusername/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
[WDM] - Driver [/Users/myusername/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-10 14:46:00,016 INFO Driver [/Users/myusername/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
[WDM] - Driver [/Users/myusername/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-10 14:46:00,038 INFO Driver [/Users/myusername/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
[WDM] - ====== WebDriver manager ======
2022-07-10 14:46:19,459 INFO ====== WebDriver manager ======
[WDM] - Current google-chrome version is 103.0.5060
2022-07-10 14:46:19,552 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-10 14:46:19,552 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Driver [/Users/myusername/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-10 14:46:19,647 INFO Driver [/Users/myusername/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
run_process called 9 times
[WDM] - ====== WebDriver manager ======
2022-07-10 14:46:42,827 INFO ====== WebDriver manager ======
[WDM] - Current google-chrome version is 103.0.5060
2022-07-10 14:46:43,131 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-10 14:46:43,131 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Driver [/Users/myusername/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-10 14:46:43,745 INFO Driver [/Users/myusername/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
run_process called 10 times
Sleeping...
Data
0
https://spare.avspart.com/catalog/hitachi/101:uh02/06e2437d-a240-49d0-ac8d-fc553bff6c53/
https://spare.avspart.com/catalog/hitachi/101:uh02/0d79c019-4621-4a47-8127-bd7baa5f0c0b/
https://spare.avspart.com/catalog/hitachi/101:uh02/1a7f894f-c1b8-456b-8ed3-bf78c60e4a71/
https://spare.avspart.com/catalog/hitachi/101:uh02/1c6fe013-e139-4112-81a5-c01fc4591803/
https://spare.avspart.com/catalog/hitachi/101:uh02/1d07c2a9-d4f8-4b50-a6bc-e64951cd7e8e/
https://spare.avspart.com/catalog/hitachi/101:uh02/2780b803-2f37-4777-a5c6-97ea9e54137d/
https://spare.avspart.com/catalog/hitachi/101:uh02/3aa2c54f-154e-4aae-8f2a-efb05b471bfa/
https://spare.avspart.com/catalog/hitachi/101:uh02/3c0b42bb-c6c9-4f60-8c2e-d5258a703d76/
https://spare.avspart.com/catalog/hitachi/101:uh02/47a76d4e-70b0-4b6d-9308-67b91a4619ad/
https://spare.avspart.com/catalog/hitachi/101:uh02/540f4b09-795a-41de-9715-8825e296018b/
https://spare.avspart.com/catalog/hitachi/101:uh02/57cefeb3-9dd2-4f99-a552-50dc452b6565/
https://spare.avspart.com/catalog/hitachi/101:uh02/58c4d3b6-9a15-4be0-8082-19980c2119fe/
https://spare.avspart.com/catalog/hitachi/101:uh02/5b2f40e4-a61f-4a3d-a15f-a41659595b28/
https://spare.avspart.com/catalog/hitachi/101:uh02/5b2f40e4-a61f-4a3d-a15f-a41659595b45/