import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from nrclex import NRCLex
import time
# Load every raw table used by the preprocessing steps.
# Tables stored under the data/ sub-folder.
profile = pd.read_csv("data/profiles.csv")
anime = pd.read_csv("data/anime.csv")
reviews = pd.read_csv("data/reviews.csv")
# Tables stored next to the notebook.
animes = pd.read_csv("animes.csv")
animelist = pd.read_csv("animelist.csv")
rating = pd.read_csv("rating_complete.csv")
watchingStatus = pd.read_csv("watching_status.csv")
animeSynopsis = pd.read_csv("anime_with_synopsis.csv")
# Show (rows, columns) for six of the tables as a quick sanity check.
tuple(
    frame.shape
    for frame in (profile, anime, animes, rating, reviews, animeSynopsis)
)
((81727, 4), (17562, 35), (19311, 12), (57633278, 3), (192112, 7), (16214, 5))
In this section we make two changes to profiles.csv. First, we drop the "link" column, which is not used in this project. Second, we add a "num_favorite" column that records the number of favorite anime of each user.
# Preview the first rows of the profile table before cleaning it.
profile.head()
profile | gender | birthday | favorites_anime | link | |
---|---|---|---|---|---|
0 | DesolatePsyche | Male | Oct 2, 1994 | ['33352', '25013', '5530', '33674', '1482', '2... | https://myanimelist.net/profile/DesolatePsyche |
1 | baekbeans | Female | Nov 10, 2000 | ['11061', '31964', '853', '20583', '918', '925... | https://myanimelist.net/profile/baekbeans |
2 | skrn | NaN | NaN | ['918', '2904', '11741', '17074', '23273', '32... | https://myanimelist.net/profile/skrn |
3 | edgewalker00 | Male | Sep 5 | ['5680', '849', '2904', '3588', '37349'] | https://myanimelist.net/profile/edgewalker00 |
4 | aManOfCulture99 | Male | Oct 30, 1999 | ['4181', '7791', '9617', '5680', '2167', '4382... | https://myanimelist.net/profile/aManOfCulture99 |
# Drop the profile URL column. `columns=` is used because pandas 2.x no longer
# accepts the axis as a second positional argument to DataFrame.drop.
profile = profile.drop(columns="link")
# favorites_anime comes out of read_csv as the *text* of a Python list
# (e.g. "['33352', '25013']"), so len() on a value would count characters,
# not favourites. Count the numeric ids inside the text instead; "[]" gives 0.
# NOTE(review): a missing favorites_anime value would yield NaN here — confirm
# the column has no missing entries.
profile["num_favorite"] = profile["favorites_anime"].str.count(r"[0-9]+")
# Persist the cleaned table (the DataFrame index is written as the first column).
profile.to_csv("profiles.csv")
As the website indicates, every entry with rating = 0 is actually a missing value, so we replace those entries with None (stored as NaN). Likewise, watching_status values outside the documented set of codes are treated as missing.
# A rating of 0 stands for "no rating given", so turn it into a missing value.
# Series.where keeps the entries where the condition is True and writes NaN
# everywhere else, which avoids a Python-level loop over every row.
animelist["rating"] = animelist["rating"].where(animelist["rating"] != 0)
# Only these status codes are accepted; any other value becomes missing.
valid_watching_status = [1, 2, 3, 4, 6]
animelist["watching_status"] = animelist["watching_status"].where(
    animelist["watching_status"].isin(valid_watching_status)
)
animelist.head()
user_id | anime_id | rating | watching_status | watched_episodes | |
---|---|---|---|---|---|
0 | 0 | 67 | 9.0 | 1.0 | 1 |
1 | 0 | 6702 | 7.0 | 1.0 | 4 |
2 | 0 | 242 | 10.0 | 1.0 | 4 |
3 | 0 | 4898 | NaN | 1.0 | 1 |
4 | 0 | 21 | 10.0 | 1.0 | 0 |
# Keep only the leading 21,846,732 rows of the cleaned list.
# NOTE(review): the reason for this exact cut-off is not visible here — confirm.
animelist = animelist.head(21846732)
# Write the truncated table to disk (index included as the first column).
animelist.to_csv(path_or_buf="animeRating.csv")
# List the column labels of the anime table to see which ones can be dropped.
anime.columns
Index(['MAL_ID', 'Name', 'Score', 'Genres', 'English name', 'Japanese name', 'Type', 'Episodes', 'Aired', 'Premiered', 'Producers', 'Licensors', 'Studios', 'Source', 'Duration', 'Rating', 'Ranked', 'Popularity', 'Members', 'Favorites', 'Watching', 'Completed', 'On-Hold', 'Dropped', 'Plan to Watch', 'Score-10', 'Score-9', 'Score-8', 'Score-7', 'Score-6', 'Score-5', 'Score-4', 'Score-3', 'Score-2', 'Score-1'], dtype='object')
# Remove the per-score vote-count columns "Score-1" ... "Score-10".
# They are dropped in a single call (one copy of the frame instead of ten), and
# `columns=` is used because pandas 2.x rejects a positional axis argument.
score_count_columns = ["Score-" + str(i) for i in range(1, 11)]
anime = anime.drop(columns=score_count_columns)
anime.head()
Unnamed: 0 | MAL_ID | Name | Score | Genres | English name | Japanese name | Type | Episodes | Aired | ... | Rating | Ranked | Popularity | Members | Favorites | Watching | Completed | On-Hold | Dropped | Plan to Watch | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1 | Cowboy Bebop | 8.78 | Action, Adventure, Comedy, Drama, Sci-Fi, Space | Cowboy Bebop | カウボーイビバップ | TV | 26 | Apr 3, 1998 to Apr 24, 1999 | ... | R - 17+ (violence & profanity) | 28.0 | 39 | 1251960 | 61971 | 105808 | 718161 | 71513 | 26678 | 329800 |
1 | 1 | 5 | Cowboy Bebop: Tengoku no Tobira | 8.39 | Action, Drama, Mystery, Sci-Fi, Space | Cowboy Bebop:The Movie | カウボーイビバップ 天国の扉 | Movie | 1 | Sep 1, 2001 | ... | R - 17+ (violence & profanity) | 159.0 | 518 | 273145 | 1174 | 4143 | 208333 | 1935 | 770 | 57964 |
2 | 2 | 6 | Trigun | 8.24 | Action, Sci-Fi, Adventure, Comedy, Drama, Shounen | Trigun | トライガン | TV | 26 | Apr 1, 1998 to Sep 30, 1998 | ... | PG-13 - Teens 13 or older | 266.0 | 201 | 558913 | 12944 | 29113 | 343492 | 25465 | 13925 | 146918 |
3 | 3 | 7 | Witch Hunter Robin | 7.27 | Action, Mystery, Police, Supernatural, Drama, ... | Witch Hunter Robin | Witch Hunter ROBIN (ウイッチハンターロビン) | TV | 26 | Jul 2, 2002 to Dec 24, 2002 | ... | PG-13 - Teens 13 or older | 2481.0 | 1467 | 94683 | 587 | 4300 | 46165 | 5121 | 5378 | 33719 |
4 | 4 | 8 | Bouken Ou Beet | 6.98 | Adventure, Fantasy, Shounen, Supernatural | Beet the Vandel Buster | 冒険王ビィト | TV | 52 | Sep 30, 2004 to Sep 29, 2005 | ... | PG - Children | 3710.0 | 4369 | 13224 | 18 | 642 | 7314 | 766 | 1108 | 3394 |
5 rows × 26 columns
# Build a small table with the first four-digit number found in "Aired"
# (taken as the year), plus the name, episode count and type of every anime.
# Rows without a four-digit number get NaN, so the Year column is float.
year = pd.DataFrame(
    {
        "Year": anime.Aired.str.extract(r"([0-9]{4})", expand=False).astype(float),
        "Name": anime.Name,
        "Episodes": anime.Episodes,
        "Type": anime.Type,
    }
)
# Keep only entries with a known episode count that are not movies.
# A boolean mask is used on purpose: the earlier approach of collecting row
# positions with 0 as a placeholder and removing a single 0 dropped row 0
# itself and then re-inserted it once for every "Unknown" row.
year = year[(year.Episodes != "Unknown") & (year.Type != "Movie")]
year.head()
Year | Name | Episodes | Type | |
---|---|---|---|---|
2 | 1998.0 | Trigun | 26 | TV |
3 | 2002.0 | Witch Hunter Robin | 26 | TV |
4 | 2004.0 | Bouken Ou Beet | 52 | TV |
5 | 2005.0 | Eyeshield 21 | 145 | TV |
6 | 2005.0 | Hachimitsu to Clover | 24 | TV |
# Persist both derived tables; each file gets the DataFrame index as its
# first column because to_csv is called with its default settings.
year.to_csv(path_or_buf="year.csv")
anime.to_csv(path_or_buf="anime.csv")
Flatten the score dictionary of each review into one column per category, and then extract the sentiment scores of each review text.
# Work on the first 84,389 reviews only.
# NOTE(review): the reason for this exact cut-off is not visible here — confirm.
reviews = reviews.iloc[:84389, :]
# `columns=` is required on pandas 2.x, which rejects a positional axis.
reviews = reviews.drop(columns="link")
# Lower-case the review text before the sentiment lexicon lookup.
reviews["text"] = reviews["text"].str.lower()

# "scores" holds the text of a dict such as "{'Overall': '8', 'Story': '8', ...}".
# Pull out all numbers once per review, then take them by position.
# This relies on every review listing the six categories in this order and
# raises IndexError if a review has fewer than six numbers.
number_pattern = re.compile(r"[0-9]+")
parsed_review_scores = [number_pattern.findall(score) for score in reviews.scores]
review_score_columns = [
    "overall",
    "story",
    "animation",
    "sound",
    "character",
    "enjoyment",
]
for position, column in enumerate(review_score_columns):
    reviews[column] = [int(numbers[position]) for numbers in parsed_review_scores]
# Run the NRC emotion lexicon over every review text and time the whole pass.
# The texts are iterated directly instead of being looked up with a counter,
# so the result does not depend on the index labels being exactly 0..n-1.
start = time.time()
senti_list = [NRCLex(text) for text in reviews.text]
end = time.time()
print("The encoding process took ", end-start, " s.")
The encoding process took 655.4879891872406 s.
# One dict of emotion -> relative frequency per review.
senti_detail = [sentiment.affect_frequencies for sentiment in senti_list]

# Add one column per emotion, in this fixed order.
emotion_columns = [
    "fear",
    "anger",
    "anticip",
    "trust",
    "surprise",
    "positive",
    "negative",
    "sadness",
    "disgust",
    "joy",
]
for column in emotion_columns:
    # The "anticip" entry of affect_frequencies comes out as 0.0 for every
    # review, so the anticipation frequency is read from the "anticipation"
    # key instead (0.0 when that key is absent). The column keeps its
    # original name "anticip".
    # NOTE(review): assumes NRCLex reports the value under "anticipation" —
    # confirm against the installed version.
    source_key = "anticipation" if column == "anticip" else column
    reviews[column] = [frequencies.get(source_key, 0.0) for frequencies in senti_detail]
reviews.head()
uid | profile | anime_uid | text | score | scores | fear | anger | anticip | trust | ... | negative | sadness | disgust | joy | overall | story | animation | sound | character | enjoyment | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 255938 | DesolatePsyche | 34096 | \n \n \n \n ... | 8 | {'Overall': '8', 'Story': '8', 'Animation': '8... | 0.047619 | 0.054422 | 0.0 | 0.129252 | ... | 0.136054 | 0.081633 | 0.054422 | 0.122449 | 8 | 8 | 8 | 10 | 9 | 8 |
1 | 259117 | baekbeans | 34599 | \n \n \n \n ... | 10 | {'Overall': '10', 'Story': '10', 'Animation': ... | 0.088235 | 0.029412 | 0.0 | 0.134454 | ... | 0.105042 | 0.058824 | 0.021008 | 0.117647 | 10 | 10 | 10 | 10 | 10 | 10 |
2 | 253664 | skrn | 28891 | \n \n \n \n ... | 7 | {'Overall': '7', 'Story': '7', 'Animation': '9... | 0.028169 | 0.014085 | 0.0 | 0.225352 | ... | 0.014085 | 0.042254 | 0.000000 | 0.183099 | 7 | 7 | 9 | 8 | 8 | 8 |
3 | 8254 | edgewalker00 | 2904 | \n \n \n \n ... | 9 | {'Overall': '9', 'Story': '9', 'Animation': '9... | 0.056769 | 0.052402 | 0.0 | 0.122271 | ... | 0.135371 | 0.074236 | 0.078603 | 0.096070 | 9 | 9 | 9 | 10 | 10 | 9 |
4 | 291149 | aManOfCulture99 | 4181 | \n \n \n \n ... | 10 | {'Overall': '10', 'Story': '10', 'Animation': ... | 0.072727 | 0.036364 | 0.0 | 0.181818 | ... | 0.054545 | 0.054545 | 0.018182 | 0.181818 | 10 | 10 | 8 | 9 | 10 | 10 |
5 rows × 22 columns
# The raw score dictionary text is no longer needed once it has been flattened.
# `columns=` is used because pandas 2.x no longer accepts the axis as a second
# positional argument to DataFrame.drop.
reviews = reviews.drop(columns="scores")
# Persist the enriched reviews (the index is written as the first column).
reviews.to_csv("reviews.csv")