netflix-sql / sql_clean.sql
sql_clean.sql
Raw
/*
    Name: Esther Bergen
    Focus: Use SQL to prepare Netflix data for analysis 
*/


----- Part 1: Creating the Table and Importing the Data -----

CREATE TABLE netflix (
	show_id varchar(5),
	type varchar(10),
	title varchar(255),
	director varchar(255),
	country varchar(30),
	date_added date,
	release_year char(4),
	rating varchar(8),
	duration varchar(10),
	listed_in varchar(100)
);

SET datestyle = US, MDY;

COPY netflix
FROM 'F:\OneDrive\_EU Data Science\660 Database Management\Module 8 (ch 10)\Assignment 6\netflix.csv'
WITH (FORMAT CSV, HEADER);

SELECT * FROM netflix;


----- Part 2: Cleaning -----

/* #1
Created a backup
*/
CREATE TABLE netflix_backup AS SELECT * FROM netflix;

/* #2
I noticed there were thousands of rows with 'Not Given' in the director column.
It would not make sense to lose all of that data, but since the column is not numeric I cannot change the value to something useful like the mean/median.
Instead, I changed the values so they would be recognized by SQL as NULL.
*/
SELECT title, director FROM netflix 
WHERE director = 'Not Given';
--> 2588 rows

UPDATE netflix
SET director = NULL
WHERE director = 'Not Given';
--> Updated 2588 rows


/* #3
The 'country' column also has several unknown values coded as "Not Given," though much fewer than directors. 
Since there were fewer rows affected, I removed the data that did not have value for 'country' and used a transaction to make sure it deleted the expected number of rows.
*/
SELECT title, country FROM netflix
WHERE country = 'Not Given';
--> 287 rows

START TRANSACTION;

DELETE FROM netflix
WHERE country = 'Not Given';
--> Deleted 287

COMMIT;


/* #4
I noticed that the rating column uses both NR and UR, but they mean the same thing (not rated/unrated). 
Since NR is more common in the dataset, I decided to change UR to NR for consistency.
*/
SELECT DISTINCT rating, COUNT(rating) FROM netflix
GROUP BY rating;
--> Shows 79 instances of NR, 3 instances of UR

UPDATE netflix
SET rating = 'NR'
WHERE rating = 'UR'
RETURNING title, rating;
--> 3 rows returned, now NR.


/* #5
While going through the list of titles, I noticed one had spaces before and after the colon. 
I checked the data set and found a total of four occurrences. I replaced ' : ' with ': '.
*/
SELECT show_id, title FROM netflix
WHERE title LIKE '% : %';

UPDATE netflix
SET title = REPLACE(title, ' : ', ': ');


/* #6
The duration column is not useful in its current state. 
It shows '__ min' for movies and '__ Season(s)' for TV shows, each coded as text.
It would be better to make these into numerical values so that we can perform operations on them. 
It also does not make sense to use one column for quantity of seasons AND movie length. The units are not equivalent.
So, I decided to make two new columns (movie_minutes and tv_seasons), populate them with data based on type, and then remove the text so the column type can be changed to integer. 
*/

-- I started a new transaction to protect the data:
START TRANSACTION;

-- I created the two new columns:
ALTER TABLE netflix ADD COLUMN movie_minutes varchar(10);
ALTER TABLE netflix ADD COLUMN tv_seasons varchar(10);

-- I added the tv data to tv_seasons and movie data to movie_minutes:
UPDATE netflix 
SET movie_minutes = duration
WHERE type = 'Movie'
RETURNING title, type, duration, movie_minutes;

UPDATE netflix 
SET tv_seasons = duration
WHERE type = 'TV Show'
RETURNING title, type, duration, tv_seasons;

-- Checking:
SELECT title, type, duration, movie_minutes, tv_seasons FROM netflix;

-- I removed ' min' from the end of the movie_minutes column:
UPDATE netflix
SET movie_minutes = SUBSTRING(movie_minutes, 1, LENGTH(movie_minutes) -4)
WHERE type = 'Movie'
RETURNING title, duration, movie_minutes;

-- tv_seasons has both 'Season' and 'Seasons.' I set the value to 1 when it was just 1 season:
UPDATE netflix
SET tv_seasons = 1
WHERE type = 'TV Show' AND tv_seasons = '1 Season'
RETURNING title, duration, tv_seasons;

-- For the remaining tv_seasons, I removed the ' Seasons' as I did for movie_minutes:
UPDATE netflix
SET tv_seasons = SUBSTRING(tv_seasons, 1, LENGTH(tv_seasons) -8)
WHERE type = 'TV Show' AND tv_seasons LIKE '%Seasons'
RETURNING title, duration, tv_seasons;

-- I converted the two new columns to integers:
ALTER TABLE netflix ALTER COLUMN movie_minutes SET DATA TYPE integer
USING movie_minutes::integer;

ALTER TABLE netflix ALTER COLUMN tv_seasons SET DATA TYPE integer
USING tv_seasons::integer;

-- I checked the data again: 
SELECT title, type, duration, movie_minutes, tv_seasons FROM netflix;

-- Happy with the changes, I hit commit:
COMMIT;