"""Flask app that scrapes product reviews from Flipkart and caches them in MongoDB."""
from urllib.parse import quote_plus
from urllib.request import urlopen as uReq

import pymongo
import requests
from bs4 import BeautifulSoup as bs
from flask import Flask, render_template, jsonify, request

MONGO_URI = "mongodb://localhost:27017/"
DB_NAME = "crawlerDB"
FLIPKART_URL = "https://www.flipkart.com"
# Seconds to wait on each outbound HTTP call before giving up.
REQUEST_TIMEOUT = 15
# Errors raised when an expected tag is absent while walking the parsed HTML.
_MISSING_TAG_ERRORS = (AttributeError, IndexError, TypeError)

# Initialize the flask app
app = Flask(__name__)


# Display home page
@app.route('/', methods=['GET'])
def homepage():
    """Render the search form."""
    return render_template("index.html")


def _parse_review(comment, prodName, prodURL):
    """Build one review record (dict) from a single review container tag.

    Each field is extracted independently; a field whose tag cannot be found
    falls back to a placeholder string instead of failing the whole record.
    """
    try:
        # Tag holding the buyer's name
        name = comment.div.div.find_all('p', {'class': '_2sc7ZR _2V5EHH'})[0].text
    except _MISSING_TAG_ERRORS:
        name = "Not available"

    try:
        # Tag holding the rating
        rating = comment.div.div.div.div.text
    except _MISSING_TAG_ERRORS:
        rating = "Not available"

    try:
        # Tag holding the review headline
        commentHead = comment.div.div.div.p.text
    except _MISSING_TAG_ERRORS:
        commentHead = "Not Available"

    try:
        # Tag holding the review body text
        commTag = comment.div.div.find_all('div', {'class': ''})
        custComm = commTag[0].div.text
    except _MISSING_TAG_ERRORS:
        custComm = "Not Available"

    return {
        'Product Name': prodName,
        'Link': prodURL,
        "Buyer Name": name,
        "Rating": rating,
        "CommentHead": commentHead,
        "Comment": custComm,
    }


def _scrape_reviews(searchStr):
    """Scrape reviews for one product matching *searchStr* from Flipkart.

    Returns a list of review dicts (possibly empty). Raises if the search
    page cannot be fetched or does not contain the expected product markup.
    """
    # Escape the user-supplied term so characters like '&' or '#' cannot
    # alter the query string.
    srchURL = FLIPKART_URL + "/search?q=" + quote_plus(searchStr)

    # The context manager closes the connection even if read() fails.
    with uReq(srchURL, timeout=REQUEST_TIMEOUT) as uClient:
        srchPage = uClient.read()
    srch_html = bs(srchPage, 'html.parser')

    # Containers for the entries on the results page
    prod_boxes = srch_html.find_all('div', {"class": "_1AtVbE col-12-12"})
    # Drop the first three matched containers, as the original code did.
    # NOTE(review): presumably these are not real product entries - confirm
    # against the current page markup.
    del prod_boxes[0:3]
    # Use the first remaining product
    prod = prod_boxes[0]

    prodURL = FLIPKART_URL + prod.div.div.div.a['href']
    prodName = srch_html.find_all('div', {"class": "_4rR01T"})[0].text

    # Fetch and parse the product page, then locate the review containers
    prodRes = requests.get(prodURL, timeout=REQUEST_TIMEOUT)
    prod_html = bs(prodRes.text, "html.parser")
    comments = prod_html.find_all('div', {'class': '_16PBlm'})

    return [_parse_review(comment, prodName, prodURL) for comment in comments]


# Scrape and show results
@app.route('/reviews', methods=['GET', 'POST'])
def index():
    """Show reviews for the submitted search term.

    On POST, reads the ``content`` form field, strips spaces from it and uses
    the result as a MongoDB collection name. If that collection already holds
    records they are rendered directly; otherwise the reviews are scraped,
    stored in the collection and rendered. On GET, or when the search term is
    empty, the search form is rendered. Any failure while talking to MongoDB
    or scraping is logged and reported as an HTTP 500 response.
    """
    if request.method != "POST":
        return render_template('index.html')

    searchRes = request.form['content']  # get search string
    searchStr = searchRes.replace(' ', '')  # remove spaces
    if not searchStr:
        # An empty string is not a valid collection name or search term.
        return render_template('index.html')

    dbConn = None
    try:
        dbConn = pymongo.MongoClient(MONGO_URI)
        table = dbConn[DB_NAME][searchStr]

        # Materialise the cursor: Cursor.count() no longer exists in
        # PyMongo 4, and a list lets the client be closed safely afterwards.
        cached = list(table.find({}))
        if cached:
            return render_template('results.html', reviews=cached)

        reviews = _scrape_reviews(searchStr)
        for reviewDict in reviews:
            table.insert_one(reviewDict)

        # The last scraped entry is stored but not rendered; kept from the
        # original behaviour.
        # NOTE(review): presumably the last container is not a real review -
        # confirm.
        return render_template('results.html', reviews=reviews[:-1])
    except Exception:
        # Top-level boundary: record the traceback and return a valid
        # response instead of None (which Flask rejects).
        app.logger.exception('Something went wrong...')
        return 'Something went wrong...', 500
    finally:
        if dbConn is not None:
            dbConn.close()


if __name__ == "__main__":
    app.run(port=8000, debug=True)