# MyProjects / Oefeningen / Antoniemen.py
# Antoniemen.py
# Raw
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import re
import time


def requests_retry_session(
        retries=10,
        backoff_factor=0.3,
        status_forcelist=(500, 502, 504),
        session=None,
):
    """Return a ``requests`` session whose HTTP(S) adapters retry requests.

    Args:
        retries: Retry budget; used as the ``total``, ``read`` and
            ``connect`` limits of the ``Retry`` policy.
        backoff_factor: Forwarded to ``Retry`` as ``backoff_factor``.
        status_forcelist: Forwarded to ``Retry`` as ``status_forcelist``.
        session: Session to configure. When omitted (or falsy) a new
            ``requests.Session`` is created.

    Returns:
        The session, with one retrying ``HTTPAdapter`` mounted for both
        the ``http://`` and ``https://`` prefixes.
    """
    if not session:
        session = requests.Session()

    retrying_adapter = HTTPAdapter(
        max_retries=Retry(
            total=retries,
            read=retries,
            connect=retries,
            backoff_factor=backoff_factor,
            status_forcelist=status_forcelist,
        )
    )

    # The same adapter instance serves both schemes.
    for scheme in ('http://', 'https://'):
        session.mount(scheme, retrying_adapter)
    return session


# Index page that is scanned for links to the per-letter antonym listings.
url = "https://www.mijnwoordenboek.nl/antoniem.php"

# No request body is sent; the empty mapping is passed along as ``data``.
payload = {}

# Browser-like headers attached to every request made by this script.
headers = {
    'DNT': '1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/84.0.4147.105 Safari/537.36'
    ),
    'Accept': (
        'text/html,application/xhtml+xml,application/xml;q=0.9,'
        'image/webp,image/apng,*/*;q=0.8,'
        'application/signed-exchange;v=b3;q=0.9'
    ),
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-User': '?1',
    'Sec-Fetch-Dest': 'document'
}

# Fetch the index page through a retrying session.
response = requests_retry_session().get(url, data=payload, headers=headers)

# print(response.text.encode('utf8'))

# Regular expressions used to scrape the site. They are raw strings so that
# regex escapes such as ``\/``, ``\.`` and ``\)`` are not treated as (invalid)
# Python string escapes; ``\uXXXX`` is resolved by the ``re`` module itself.

# Matches the full URL of the first page of a per-letter listing (A-Z).
letterUrlPattern = r'https:\/\/www\.mijnwoordenboek\.nl\/antoniemen\/[A-Z]\/1\.html'

# Matches a double-quoted link into /antoniemen/ and captures the path segment
# that follows: spaces, ASCII letters, U+00F0-U+02AF, '-', U+00CF-U+00EB,
# ')' and '.'.
wordUrlPattern = r'\"https:\/\/www\.mijnwoordenboek\.nl\/antoniemen\/([ A-Za-z\u00F0-\u02AF-\u00cf-\u00eb\)\.]+)'

# NOTE(review): the original antonym pattern was a byte-for-byte copy of
# wordUrlPattern, so the "antonym" taken from a word page is simply the first
# such link on that page -- confirm this is the intended element.
wordAntPattern = wordUrlPattern

# Prefix to which a captured word is appended to form its page URL.
baseUrl = 'https://www.mijnwoordenboek.nl/antoniemen/'

# Distinct per-letter listing URLs found on the index page.
letterUrls = list(set(re.findall(letterUrlPattern, response.text)))
print("Found {} letter pages!".format(len(letterUrls)))

# Gather every distinct word captured from the per-letter listing pages.
# One retrying session is shared by all of these requests and closed when the
# loop finishes; the previous code built a new session for every request and
# never closed it, leaving sockets open and preventing connection reuse.
found = set()
with requests_retry_session() as session:
    for lUrl in letterUrls:
        response = session.get(lUrl, headers=headers, data=payload)
        found.update(re.findall(wordUrlPattern, response.text))

# Unique words in sorted order, as a list.
words = sorted(found)
print("Found " + str(len(words)) + " total words!")
# Fetch each word's page and print "<word>:<first captured link>".
# A single retrying session is reused for all word pages and closed at the
# end, instead of creating (and never closing) a new session per request.
with requests_retry_session() as session:
    for word in words:
        response = session.get(baseUrl + word, headers=headers, data=payload)
        # Only the first match is needed, so search instead of collecting
        # every match on the page.
        first_match = re.search(wordAntPattern, response.text)
        if first_match is None:
            print("[ERROR] " + word)
            continue
        ant = first_match.group(1)
        # Closing parentheses are stripped from the word before printing.
        print(word.replace(")", "") + ":" + ant)