import re
import time

import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry


def requests_retry_session(
    retries=10,
    backoff_factor=0.3,
    status_forcelist=(500, 502, 504),
    session=None,
):
    """Return a ``requests`` session that retries failed requests.

    Args:
        retries: Used for the total, read and connect retry budgets.
        backoff_factor: Backoff factor handed to urllib3's ``Retry``.
        status_forcelist: HTTP status codes that trigger a retry.
        session: Existing session to configure; a new ``requests.Session``
            is created when omitted.

    Returns:
        The session, with a retrying ``HTTPAdapter`` mounted for both the
        ``http://`` and ``https://`` prefixes.
    """
    if session is None:
        session = requests.Session()
    retry = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session


url = "https://www.mijnwoordenboek.nl/antoniem.php"
baseUrl = 'https://www.mijnwoordenboek.nl/antoniemen/'

# Seconds to wait for the server before giving up on one attempt. Without a
# timeout a stalled connection blocks forever and the retry policy above
# never gets a chance to run.
REQUEST_TIMEOUT = 30

headers = {
    'DNT': '1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-User': '?1',
    'Sec-Fetch-Dest': 'document',
}

# Raw strings: the previous non-raw literals relied on invalid escape
# sequences such as "\/" and "\.", which newer Python versions warn about.
letterUrlPattern = r'https://www\.mijnwoordenboek\.nl/antoniemen/[A-Z]/1\.html'
# The character class accepts space, ASCII letters, U+00F0-U+02AF, a literal
# hyphen, U+00CF-U+00EB, ")" and ".".
# NOTE(review): the two accented ranges skip U+00C0-U+00CE and U+00EC-U+00EF;
# confirm that is intended.
wordUrlPattern = r'"https://www\.mijnwoordenboek\.nl/antoniemen/([ A-Za-z\u00F0-\u02AF\-\u00cf-\u00eb).]+)'
# NOTE(review): this has always been identical to wordUrlPattern, so the
# "antonym" reported for a word is simply the first matching link on its
# page - confirm that this is what the site layout requires.
wordAntPattern = wordUrlPattern

letterUrlRegex = re.compile(letterUrlPattern)
wordUrlRegex = re.compile(wordUrlPattern)
wordAntRegex = re.compile(wordAntPattern)


def _fetch_text(session, target):
    """Return the decoded body of a GET request for *target*."""
    return session.get(target, headers=headers, timeout=REQUEST_TIMEOUT).text


# One session for the whole run, so connections are reused and are closed
# when the block exits.
with requests_retry_session() as session:
    # Step 1: collect the per-letter index pages linked from the start page.
    letterUrls = sorted(set(letterUrlRegex.findall(_fetch_text(session, url))))
    print(f"Found {len(letterUrls)} letter pages!")

    # Step 2: collect every distinct word linked from those index pages.
    foundWords = set()
    for lUrl in letterUrls:
        foundWords.update(wordUrlRegex.findall(_fetch_text(session, lUrl)))
    words = sorted(foundWords)
    print(f"Found {len(words)} total words!")

    # Step 3: fetch each word page and print "word:antonym".
    for word in words:
        match = wordAntRegex.search(_fetch_text(session, baseUrl + word))
        if match is None:
            print("[ERROR] " + word)
            continue
        ant = match.group(1)
        print(word.replace(")", "") + ":" + ant)