# review-scrapper / app.py
from urllib.parse import quote_plus
from urllib.request import urlopen as uReq

import requests
from bs4 import BeautifulSoup as bs
from flask import Flask, render_template, jsonify, request
from flask_cors import CORS, cross_origin  # to avoid cross domain errors
# import pymongo

# Create the Flask application object; the @app.route decorators below register
# their view functions on it.
app = Flask(__name__)
# Cross-origin support is currently switched off; uncomment to enable flask_cors.
# CORS(app)

# Landing page: serves the search form
@app.route('/', methods=['GET'])
# @cross_origin   # to avoid cross domain errors
def homepage():
	"""Render ``index.html`` in response to a GET request on ``/``."""
	landing_page = render_template("index.html")
	return landing_page

# Scrape and show results

# Seconds to wait on Flipkart before giving up; neither urlopen nor
# requests.get applies a timeout unless one is passed explicitly.
_FETCH_TIMEOUT = 10

# Exceptions raised by the tag lookups below when an expected tag is absent
# (attribute access on None, indexing an empty result list).
_MISSING_TAG_ERRORS = (AttributeError, IndexError, TypeError)


def _tag_text(lookup, fallback):
	"""Return the value of ``lookup()``, or ``fallback`` when a tag it expects is missing."""
	try:
		return lookup()
	except _MISSING_TAG_ERRORS:
		return fallback


def _build_review(comment, prodName, prodURL):
	"""Build the dict for one review block of the product page.

	Each field is looked up independently, so one missing tag only replaces
	that field with its placeholder text instead of discarding the review.
	"""
	return {
		'Product Name': prodName,
		'Link': prodURL,
		"Buyer Name": _tag_text(
			lambda: comment.div.div.find_all('p', {'class': '_2sc7ZR _2V5EHH'})[0].text,
			"Not available",
		),
		"Rating": _tag_text(lambda: comment.div.div.div.div.text, "Not available"),
		"CommentHead": _tag_text(lambda: comment.div.div.div.p.text, "Not Available"),
		"Comment": _tag_text(
			lambda: comment.div.div.find_all('div', {'class': ''})[0].div.text,
			"Not Available",
		),
	}


@app.route('/reviews', methods=['POST'])
def index():
	"""Search Flipkart for the submitted text and show reviews of one matching product.

	Reads the ``content`` form field, fetches the Flipkart search page, follows
	the link of one product box, and collects the review blocks found on that
	product page.

	Returns:
		The rendered ``results.html`` with a ``reviews`` list of dicts (keys
		``Product Name``, ``Link``, ``Buyer Name``, ``Rating``, ``CommentHead``,
		``Comment``), or the rendered ``index.html`` with an ``error`` message
		when fetching or parsing fails.

	The ``content`` lookup happens outside the ``try`` block, so a request
	without that field is not turned into the error page here.
	"""
	searchRes = request.form['content']  # get search string
	# Spaces are removed as before; quote_plus then escapes whatever is left
	# ('&', '#', non-ASCII, ...) so user input cannot break the query string.
	searchStr = quote_plus(searchRes.replace(' ', ''))
	# NOTE: a MongoDB cache keyed by the search string used to be consulted here
	# (and filled after scraping); it is disabled, every request scrapes afresh.
	try:
		# Create product search url
		srchURL = "https://www.flipkart.com/search?q=" + searchStr
		# Fetch the search results page; the context manager closes the
		# connection even if reading fails.
		with uReq(srchURL, timeout=_FETCH_TIMEOUT) as uClient:
			srchPage = uClient.read()
		srch_html = bs(srchPage, 'html.parser')
		# Find the div class tag for all products on the page
		prod_boxes = srch_html.find_all('div', {"class": "_1AtVbE col-12-12"})
		# The first three boxes are skipped and the fourth is used for the demo
		# (presumably the leading boxes are not product entries -- TODO confirm).
		prod = prod_boxes[3]
		# Extract the product link
		prodURL = "https://www.flipkart.com" + prod.div.div.div.a['href']
		# Extract full product name
		prodName = srch_html.find_all('div', {"class": "_4rR01T"})[0].text
		# Open and parse the product page
		prodRes = requests.get(prodURL, timeout=_FETCH_TIMEOUT)
		prod_html = bs(prodRes.text, "html.parser")
		# Find sections (class tag) with customer reviews
		comments = prod_html.find_all('div', {'class': '_16PBlm'})
		reviews = [_build_review(comment, prodName, prodURL) for comment in comments]
		# The last matched block is left out, as before (presumably it is not an
		# actual customer review -- TODO confirm against the live page).
		return render_template('results.html', reviews=reviews[:-1])

	except Exception:
		# Any network, parsing or rendering failure ends up here; log the
		# traceback so the real cause is not lost, then show a generic message.
		error = "Oops... something went wrong. " \
		        "Please search for a different product or try one of the keywords mentioned above."
		app.logger.exception(error)
		return render_template('index.html', error=error)

# Start the server via app.run() only when this file is executed directly as a script.
if __name__ == "__main__":
	# NOTE(review): debug=True switches on Flask's debug mode; confirm this
	# entry point is never the one used for the deployed app.
	app.run(debug=True)

# App Link on Heroku: https://protected-island-35343.herokuapp.com/