steam-review-scraper / steam_review_scraper / steam.py
steam.py
Raw
"""
Functions allowing us to interact with and gather
data from the steam servers. This data is then saved
to a db (steam.db) in the project root folder.

Attributes:
    STEAM_REVIEW_SORT_FILTERS (lst(str)): The different steam api sort modes.
"""

import datetime
import logging
import json
import urllib
import re
import os
import time
import sys

import common
import db_common
import review_model


# Sort modes passed as the "filter" option of the Steam review api.
# The first entry is the mode used by parse_reviews_for_app; the last
# entry is the default of get_reviews_from_api.
STEAM_REVIEW_SORT_FILTERS = ("recent", "updated", "all")


class ConnectionError(Exception):
    """Raised when the Steam servers could not be queried successfully.

    The functions in this module raise it when a request comes back
    with a response code other than 200.

    NOTE: on Python 3 this name shadows the builtin ``ConnectionError``
    inside this module; unlike the builtin it derives directly from
    ``Exception``.
    """


def parse_reviews_for_app(appid):
    """
    Look up a game on Steam and sync its reviews into the db.

    The app is registered in the db first, then the reviews are
    gathered using the first entry of STEAM_REVIEW_SORT_FILTERS as
    sort mode, and finally the tracked languages are stored.

    Args:
        appid (int): App/Game id.

    Returns:
        int: Number of reviews reported by the review parse loop.
    """
    game_name = get_steam_game_info(appid)["name"]
    sort_filter = STEAM_REVIEW_SORT_FILTERS[0]

    start_message = "Retrieving and parsing reviews for '{0}' ({1}) {2}...".format(
        game_name, appid, sort_filter)
    logging.info(start_message)

    db_common.insert_or_update_app(appid, game_name)

    # The same language objects drive the review query and are stored afterwards.
    tracked_languages = common.get_settings().get_tracked_languages()
    review_count = review_parse_loop(appid, tracked_languages, sort_filter)
    db_common.insert_or_update_languages(tracked_languages)

    separator = "---------------------------"
    summary_lines = (
        separator,
        "Added in total {} reviews".format(review_count),
        separator,
    )
    for line in summary_lines:
        logging.info(line)

    return review_count


def get_steam_game_info(appid):
    """ Query Steam api for game data.

    Args:
        appid (int): App/Game id.

    Returns:
        dict: Parsed json game data (the "data" object stored under
            the appid key of the appdetails response).

    Raises:
        ConnectionError: If the response code is not 200.
        ValueError: If the response holds no game data for the appid,
            either because the appid key is missing or because its
            "data" entry is missing/empty.
    """
    url = "http://store.steampowered.com/api/appdetails?appids={}".format(appid)

    response = urllib.urlopen(url)
    try:
        response_code = response.getcode()
        response_content = response.read()
    finally:
        # Release the connection even when reading the response fails.
        response.close()

    if response_code != 200:
        raise ConnectionError(
            "Could not contact steam api. Response code is {}".format(response_code))

    # `or {}` keeps a json `null` body or a `null` app entry from blowing
    # up with an AttributeError; both end in the ValueError below.
    data = json.loads(response_content) or {}
    app_entry = data.get(str(appid)) or {}
    game_data = app_entry.get("data")

    # A missing appid key used to fall through and return None, which
    # made callers fail later with an unrelated TypeError.
    if not game_data:
        raise ValueError("Provided appid ({}) is not a valid steam id.".format(appid))

    return game_data


def review_parse_loop(appid, languages, sort_by):
    """ Main parse review loop.

    Repeatedly queries the Steam api for pages of reviews (100 per
    request) and passes every page to db_common.insert_or_update_reviews.
    The loop stops when the api returns no cursor or a cursor that has
    already been used.

    Args:
        appid (int): App/Game id.
        languages (lst(Language)): List of language objects. Only their
            `steam_key` attribute is read here.
        sort_by (str): Steam review sort filter, ex 'all'.

    Returns:
        int: Number of gathered reviews.
    """
    current_cursor = "*"
    # The start cursor counts as used: if the api hands "*" back we would
    # otherwise fetch the first page forever. A set keeps lookups O(1).
    seen_cursors = set([current_cursor])

    language_keys = [lang.steam_key for lang in languages]
    # Stays None until a response reports a total; only the displayed
    # label falls back to "Unknown", so no math is done on a string.
    total_reviews = None
    num_added = 0
    percent = 0

    # Computed once so every page of this run is written with the same timestamp.
    updated_time = int(time.time())
    show_progressbar = os.getenv("scraper_show_progressbar", "0") == "1"

    while True:
        reviews, current_cursor, reported_total = get_reviews_from_api(
            appid, language_keys, 100, sort_by, current_cursor)
        num_added += len(reviews)

        # Not every response carries a total; keep the last known one.
        if reported_total is not None:
            total_reviews = reported_total

        if total_reviews is not None and total_reviews > 0:
            percent = round((float(num_added) / float(total_reviews)) * 100)

        total_label = "Unknown" if total_reviews is None else total_reviews

        db_common.insert_or_update_reviews(
            reviews,
            updated_time,
            include_user_input_columns=False
        )

        if num_added % 1000 == 0:
            if show_progressbar:
                # Move off the progress bar line before logging.
                sys.stdout.write("\n")

            logging.info("{0}%: {1}/{2} reviews saved to db".format(
                percent, num_added, total_label)
            )

        if show_progressbar:
            # %s for the total so the "Unknown" label can be rendered too.
            sys.stdout.write("\r %d%% [%-100s] %d/%s reviews saved to db" % (
                percent, "=" * int(percent), num_added, total_label)
            )
            sys.stdout.flush()

        if not current_cursor:
            # Without a cursor there is nothing to request the next page with.
            logging.info("breaking on missing cursor. No more reviews to add")
            break

        if current_cursor in seen_cursors:
            logging.info("breaking on seen cursor {}. No more reviews to add".format(current_cursor))
            break

        seen_cursors.add(current_cursor)

    return num_added


def get_reviews_from_api(
        steam_appid,
        languages=None,
        num_per_page=20,
        filter=STEAM_REVIEW_SORT_FILTERS[-1],
        cursor="*"):
    """Query the Steam api for reviews.

    Args:
        steam_appid (int): The game/app id.
        languages (lst(str)): Languages in format ex 'english'. When empty
            or omitted, 'all' is requested and no review is skipped.
        num_per_page (int): Page count. Will be the number of returned reviews. Max 100 as per Steam limits.
        filter (str): Filter, ex 'all'.
        cursor (str): Current cursor. New cursor is returned after every request.

    Returns:
        (lst(SteamReview), str, int): Requested reviews, new cursor and total review count.
            The cursor and the total are None when the response does not contain them.

    Raises:
        ConnectionError: If the response code is not 200.
    """
    # None replaces the former mutable `[]` default; both mean "no filter".
    languages = languages or []

    # Number of days elapsed since 1993-01-01, sent as "day_range".
    delta = datetime.datetime.now().date() - datetime.date(1993, 1, 1)

    options = {
        "json": "1",
        "cursor": cursor,
        "language": ",".join(languages) if languages else "all",
        "filter": filter,
        "review_type": "all",
        "purchase_type": "all",
        "num_per_page": num_per_page,
        "day_range": delta.days
    }

    url = "http://store.steampowered.com/appreviews/{0}?json=1&{1}".format(
        steam_appid, urllib.urlencode(options))

    response = urllib.urlopen(url)
    try:
        response_code = response.getcode()
        response_content = response.read()
    finally:
        # Release the connection even when reading the response fails.
        response.close()

    if response_code != 200:
        raise ConnectionError("Could not contact steam api. Response code is {}".format(response_code))

    # `or {}` / `or []` tolerate json `null` values and absent keys, so a
    # response without reviews yields an empty page instead of a KeyError.
    response_data = json.loads(response_content) or {}
    query_summary = response_data.get("query_summary") or {}
    total_reviews = query_summary.get("total_reviews", None)
    response_cursor = response_data.get("cursor", None)

    reviews = []
    for review in response_data.get("reviews") or []:
        if not review:
            continue

        review_id = review["recommendationid"]

        # The requested languages are re-checked on every returned review.
        if languages and review["language"] not in languages:
            logging.info("Skipping review {0}, {1} not in language list".format(
                review_id, review["language"]))
            continue

        reviews.append(construct_steamreview(steam_appid, review))

    return (reviews, response_cursor, total_reviews)


def construct_steamreview(steam_appid, review):
    """ Build a SteamReview out of one parsed review entry.

    Required keys are read with item access (a missing one raises
    KeyError); the timestamps, the review text and the developer
    response are optional and fall back to defaults.

    Args:
        steam_appid (int): App/Game id.
        review (dict): Parsed json review object.

    Returns:
        SteamReview: New steam review object.
    """
    recommendation_id = review["recommendationid"]
    voted_up = review["voted_up"]

    author = review["author"]
    playtime_forever = author["playtime_forever"]

    votes_up = review["votes_up"]
    # The "total" is the sum of the up votes and the funny votes.
    votes_total = votes_up + review["votes_funny"]

    num_games_owned = author["num_games_owned"]
    during_early_access = review["written_during_early_access"]
    language = review["language"]
    received_for_free = review["received_for_free"]

    # The steam id doubles as user name and is the base of both links.
    steam_id = author["steamid"]
    review_link = "https://steamcommunity.com/profiles/{}/recommended/{}".format(
        steam_id, steam_appid)
    profile_link = "http://steamcommunity.com/profiles/{}".format(steam_id)

    from_timestamp = datetime.datetime.fromtimestamp
    posted_at = from_timestamp(review.get("timestamp_created", 0))
    updated_at = from_timestamp(review.get("timestamp_updated", 0))

    text = review.get("review", "")
    developer_response = review.get("developer_response", None)

    # Only convert the response time when the review actually has one.
    response_timestamp = review.get("timestamp_dev_responded", None)
    if response_timestamp is None:
        responded_at = None
    else:
        responded_at = from_timestamp(response_timestamp)

    fields = {
        "recommended": voted_up,
        "hours_played": playtime_forever,
        "helpful_amount": votes_up,
        "helpful_total": votes_total,
        "games_owned": num_games_owned,
        "early_access_review": during_early_access,
        "lang_key": language,
        "received_compensation": received_for_free,
        "user_name": steam_id,
        "user_link": profile_link,
        "date_posted": posted_at,
        "date_updated": updated_at,
        "content": text,
        "responded_by": developer_response,
        "responded_date": responded_at,
    }
    return review_model.SteamReview(
        recommendation_id, review_link, steam_appid, **fields)


def remove_deleted_reviews(steam_appid, compare_time):
    """
    Delete the reviews of an app that were not changed by the last sync.

    The steam keys of the tracked languages and the compare time are
    handed to db_common.delete_all_unchanged_reviews, and the number of
    deleted reviews it returns is logged.

    Args:
        steam_appid (int): App/Game id.
        compare_time (int): Epoch time of before last update.
    """
    tracked_languages = common.get_settings().get_tracked_languages()

    steam_keys = []
    for language in tracked_languages:
        steam_keys.append(language.steam_key)

    joined_keys = ",".join(steam_keys)
    logging.info("Checking for deleted reviews (for {0}). Languages: {1}".format(
        steam_appid, joined_keys))

    deleted_count = db_common.delete_all_unchanged_reviews(
        steam_appid, steam_keys, compare_time)

    logging.info("Deleted {} reviews".format(deleted_count))