This is a service that controls a headless browser through JSON requests
This project, which is in continuous development, allows you to control and manage most of the important features of a headless browser through a server that receives requests in JSON and translates them into Puppeteer actions.
Download Docker Desktop here and install it.
Download MongoDB here and install it.
docker run -d --name YOUR_CONTAINER_NAME -p 27017:27017 -e MONGO_INITDB_ROOT_USERNAME=YOUR_USERNAME -e MONGO_INITDB_ROOT_PASSWORD=YOUR_PASSWORD -v C:/mongodb/cloud_scrapy/mongo:/data/db mongo:5.0.6
Download DragonFlyDB here and install it.
docker run -d --name YOUR_CONTAINER_NAME -p 6379:6379 --ulimit memlock=-1 -v C:/dragonfly/cloud_scrapy/db:/data docker.dragonflydb.io/dragonflydb/dragonfly
You only need to set the NODE_ENV variable to the name of one of the config files in the ./config directory (for example dev, pre, or pro).
NODE_ENV=dev
NODE_ENV=pre
NODE_ENV=pro
NODE_ENV=YOUR_FILE_NAME
npm install
npm run dev
npm run start
npm run tests
npm run swagger
A full execution JSON is made up of 4 JSON parts:
{
"request_description": YOUR_DESCRIPTION,
"send_in_request": {SEE_BELOW},
"get_in_response": {SEE_BELOW},
"request_config": {SEE_BELOW}
}
"request_description": YOUR_DESCRIPTION
"send_in_request": {
"url": "YOUR_URL", "options": {"READ_DOCS"},
"instructions": [
{"command": "goto", "params": ["YOUR_URL"], "options": {"READ_DOCS"}},
{"command": "click", "params": ["SELECTOR"], "options": {"READ_DOCS"}},
{"command": "click_and_wait", "params": ["SELECTOR"], "options": {"READ_DOCS"}},
{"command": "wait_for_selector", "params": ["SELECTOR"], "options": {"READ_DOCS"}},
{"command": "wait_for_selector_and_click", "params": ["SELECTOR"], "options": {"READ_DOCS"}},
{"command": "wait_selector_click_wait_nav", "params": ["SELECTOR"], "options": {"READ_DOCS"}},
{"command": "wait_for_xpath", "params": ["XPATH"], "options": {"READ_DOCS"}},
{"command": "wait_for_function", "params": ["JAVASCRIPT_FUNCTION"], "options": {"READ_DOCS"}},
{"command": "wait_for_navigation", "params": [], "options": {"READ_DOCS"}},
{"command": "evaluate", "params": ["JAVASCRIPT_CODE"], "options": {"READ_DOCS"}},
{"command": "verify", "params": ["TEXT"], "options": {"READ_DOCS"}}, //Verifies whether a text exists in the current page and returns true or false, indicating whether an action is required or not
{"command": "xpath", "params": ["XPATH"], "options": {"READ_DOCS"}},
{"command": "type", "params": ["SELECTOR", "TEXT"], "options": {"READ_DOCS"}},
{"command": "sec_type", "params": ["SELECTOR", "ENCRYPTED_TEXT"], "options": {"READ_DOCS"}},
{"command": "keyboard_press", "params": ["KEY_INPUT"], "options": {"READ_DOCS"}},
{"command": "keyboard_down", "params": ["KEY_INPUT"], "options": {"READ_DOCS"}},
{"command": "keyboard_up", "params": ["KEY_INPUT"], "options": {"READ_DOCS"}}
]
}
"get_in_response": {
"cookies": true,
"headers": true,
"html_to_pdf": true,
"logs": {
"active": true,
"full_logs": false
},
"screenshot": {
"active": true,
"full_page": false
},
"source_page": true,
"extract_rules": [
{"name": "YOUR_RULE_NAME", "selector": "SELECTOR", "attribute": "THE_ATTR_YOU_WANT"}
]
}
request_config (It is the way in which you configure your request according to your use case.) Example:
Resources: document, stylesheet, image, media, font, script, texttrack, xhr, fetch, eventsource, websocket, manifest, other
"request_config": {
"block_resources": ["RESOURCES"],
"headers": "YOUR_HEADERS",
"cookies": "YOUR_COOKIES",
"captcha": true,
"geolocation": {"latitude": "LATITUDE", "longitude": "LONGITUDE"},
"user_agent": "YOUR_USER_AGENT/OR_CLOUDSCRAPY ASSIGNS RANDOM ONE",
"view_port": {"width": "WIDTH", "height": "HEIGHT"},
"custom_proxy": "YOUR_URI_CUSTOM_PROXY"
}
{
"context_id": "YOUR_CONTEXT_ID_FROM_PREVIOUS_EXECUTION",
"request_id": "YOUR_REQUEST_ID_FROM_PREVIOUS_EXECUTION",
"request_description": "YOUR_DESCRIPTION",
"send_in_request": {THE SAME AS IN EXECUTION}
}
cloud-scrapy
├── bin
├── components
├── config
│ └── config_files
│ └── docs
├── config_deploy
│ └── pre
│ └── pro
├── controllers
├── files
│ └── logs
│ └── pdfs
│ └── screenshots
├── middlewares
├── models
├── objects
├── public
│ └── images
├── routes
│ └── v1
├── tests
└── utils
👤 Jose E Cortes
This project is ISC licensed.
Give a ⭐ if this project helped you!