# review-scrapper / local_flask_app.py
from flask import Flask, render_template, jsonify, request
import requests
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as bs
import pymongo

# Create the Flask application object; the @app.route decorators below
# register their view functions on it.
app = Flask(__name__)

# Serve the landing page
@app.route('/', methods=['GET'])
def homepage():
	"""Render the ``index.html`` template for GET requests to the site root."""
	landing_page = render_template("index.html")
	return landing_page

# Seconds to wait on each outbound HTTP request before giving up, so a
# stalled remote server cannot hang the request handler indefinitely.
_HTTP_TIMEOUT = 15


def _tag_text(getter, default):
	"""Return ``getter()``, or ``default`` if the expected tag is missing.

	BeautifulSoup yields ``None`` for an absent child tag and an empty list
	for a ``find_all`` with no matches, so a missing element surfaces as
	AttributeError / IndexError / TypeError while walking the tree.
	"""
	try:
		return getter()
	except (AttributeError, IndexError, TypeError):
		return default


def _parse_review(comment, prodName, prodURL):
	"""Build the record (a dict) for one review container tag.

	Each field falls back to a placeholder string when its tag cannot be
	located inside ``comment``.
	"""
	# Tag carrying the buyer's name
	name = _tag_text(
		lambda: comment.div.div.find_all('p', {'class':'_2sc7ZR _2V5EHH'})[0].text,
		"Not available")
	# Tag carrying the rating
	rating = _tag_text(lambda: comment.div.div.div.div.text, "Not available")
	# Tag carrying the review headline
	commentHead = _tag_text(lambda: comment.div.div.div.p.text, "Not Available")
	# Tag carrying the review body text
	custComm = _tag_text(
		lambda: comment.div.div.find_all('div', {'class':''})[0].div.text,
		"Not Available")

	return {'Product Name':prodName,
	        'Link':prodURL,
	        "Buyer Name":name,
	        "Rating":rating,
	        "CommentHead":commentHead,
	        "Comment":custComm}


# Scrape and show results
@app.route('/reviews', methods=['GET','POST'])
def index():
	"""Show reviews for the product searched for in the submitted form.

	On POST, the ``content`` form field (with spaces removed) is used as the
	name of a collection in the local ``crawlerDB`` MongoDB database. If that
	collection already holds records they are rendered directly; otherwise
	the first usable Flipkart search result is scraped, its reviews are
	stored in the collection and then rendered.

	Returns:
		The rendered ``results.html`` page on success, the ``index.html``
		search page for GET requests or an empty search string, or a plain
		error message with HTTP status 500 if scraping or storage fails.
	"""
	from urllib.parse import quote_plus

	# A plain GET has nothing to search for: show the search form instead of
	# returning None (which Flask rejects as an invalid response).
	if request.method != "POST":
		return render_template('index.html')

	searchRes = request.form['content']  # get search string
	searchStr = searchRes.replace(' ','') # remove spaces
	if not searchStr:
		# An empty string is not a valid collection name; ask again.
		return render_template('index.html')

	try:
		# The context manager closes the MongoDB connection on every exit path.
		with pymongo.MongoClient("mongodb://localhost:27017/") as dbConn:
			# Collection (table) in crawlerDB named after the search string
			table = dbConn['crawlerDB'][searchStr]

			# Materialise the cursor: Cursor.count() no longer exists in
			# PyMongo 4, and a list can be both tested and rendered.
			cached = list(table.find({}))
			if cached:
				return render_template('results.html', reviews=cached)

			# Build the product search URL, escaping characters such as
			# '&' or '#' that would otherwise corrupt the query string.
			srchURL = "https://www.flipkart.com/search?q=" + quote_plus(searchStr)
			# Fetch the search results page; the handle is closed even if
			# reading fails.
			with uReq(srchURL, timeout=_HTTP_TIMEOUT) as uClient:
				srchPage = uClient.read()
			srch_html = bs(srchPage, 'html.parser')

			# Find the div class tag for all products on the page
			prod_boxes = srch_html.find_all('div', {"class" : "_1AtVbE col-12-12"})
			# Skip the first three matches and use the next one as the
			# product (same element the original picked after deleting
			# entries 0-2); raises IndexError when there are too few matches.
			prod = prod_boxes[3]
			prodURL = "https://www.flipkart.com" + prod.div.div.div.a['href']
			# Full product name as shown on the search page
			prodName = srch_html.find_all('div', {"class" : "_4rR01T"})[0].text

			# Open and parse the product page
			prodRes = requests.get(prodURL, timeout=_HTTP_TIMEOUT)
			prod_html = bs(prodRes.text, "html.parser")
			# Sections (class tag) holding customer reviews
			comments = prod_html.find_all('div', {'class':'_16PBlm'})

			reviews = []
			for comment in comments:
				reviewDict = _parse_review(comment, prodName, prodURL)
				# Add record to the DB collection
				table.insert_one(reviewDict)
				reviews.append(reviewDict)

			# The last scraped entry is stored but left out of the rendered
			# page, exactly as before.
			# NOTE(review): presumably the final '_16PBlm' block is not a
			# real review - confirm, and if so consider not storing it.
			return render_template('results.html', reviews=reviews[:-1])

	except Exception:
		# Request boundary: record the traceback and answer with a real
		# error response rather than falling through and returning None.
		app.logger.exception('Something went wrong...')
		return 'Something went wrong...', 500

# When this file is executed directly (not imported), start Flask's built-in
# server on port 8000 with debug mode switched on.
# NOTE: debug mode is meant for local development only.
if __name__ == "__main__":
	app.run(port=8000, debug=True)