import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from nrclex import NRCLex
import time


# Inputs that live under the data/ directory.
profile, anime, reviews = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/profiles.csv",
        "data/anime.csv",
        "data/reviews.csv",
    )
)

# Inputs that are read from the working directory.
animes, animelist, rating, watchingStatus, animeSynopsis = (
    pd.read_csv(csv_path)
    for csv_path in (
        "animes.csv",
        "animelist.csv",
        "rating_complete.csv",
        "watching_status.csv",
        "anime_with_synopsis.csv",
    )
)


# Quick sanity check: (rows, columns) of the frames of interest.
tuple(
    frame.shape
    for frame in (profile, anime, animes, rating, reviews, animeSynopsis)
)

((81727, 4), (17562, 35), (19311, 12), (57633278, 3), (192112, 7), (16214, 5))


profile.head()


# Remove the "link" column from the profile table.
# The axis is selected with the `columns=` keyword: passing it positionally
# (drop("link", 1)) is deprecated since pandas 1.3 and raises TypeError on
# pandas 2.0 and later.
profile = profile.drop(columns="link")


def _count_favorites(cell):
    """Return how many anime ids are listed in one favorites_anime cell.

    The column is read from CSV, so each cell is the *text* of a list such as
    "['5680', '849', '2904']". Calling len() on that text counts characters,
    not favourites, so the ids are counted with a regex instead.
    A cell that is not a string (e.g. a missing value) counts as 0.

    NOTE(review): assumes every id is a run of digits, as in the sample
    rows shown in this notebook - confirm against the full file.
    """
    if not isinstance(cell, str):
        return 0
    return len(re.findall(r"[0-9]+", cell))


# Number of favourite anime per profile.
length = [_count_favorites(fav) for fav in profile["favorites_anime"]]
profile["num_favorite"] = length


profile.to_csv("profiles.csv")


# A rating of 0 is treated as "no rating": blank it out so it becomes a
# missing value. Done as one vectorised operation instead of a Python-level
# loop over every row of the frame.
ratings = animelist["rating"]
cleaned_rating = ratings.mask(ratings == 0)
animelist["rating"] = cleaned_rating


# Only the status codes 1, 2, 3, 4 and 6 are kept; any other value becomes
# a missing value.
statuses = animelist["watching_status"]
cleaned_watching = statuses.where(statuses.isin([1, 2, 3, 4, 6]))
animelist["watching_status"] = cleaned_watching


animelist.head()


# Keep only the first 21,846,732 rows (all columns).
# NOTE(review): the reason for this exact cut-off is not recorded in the
# notebook - confirm before reusing it on a different export.
animelist = animelist.iloc[:21846732]


# Persist the cleaned, truncated table.
animelist.to_csv("animeRating.csv")


# Show the available column names before pruning the frame.
anime.columns

Index(['MAL_ID', 'Name', 'Score', 'Genres', 'English name', 'Japanese name',
       'Type', 'Episodes', 'Aired', 'Premiered', 'Producers', 'Licensors',
       'Studios', 'Source', 'Duration', 'Rating', 'Ranked', 'Popularity',
       'Members', 'Favorites', 'Watching', 'Completed', 'On-Hold', 'Dropped',
       'Plan to Watch', 'Score-10', 'Score-9', 'Score-8', 'Score-7', 'Score-6',
       'Score-5', 'Score-4', 'Score-3', 'Score-2', 'Score-1'],
      dtype='object')


# Remove the ten columns "Score-1" ... "Score-10" in a single call.
# `columns=` is used because the positional axis argument of DataFrame.drop
# is deprecated since pandas 1.3 and raises TypeError on pandas 2.0+;
# dropping all labels at once also avoids copying the frame ten times.
name = "Score-"
score_columns = [name + str(i) for i in range(1, 11)]
anime = anime.drop(columns=score_columns)


anime.head()


def _first_year(aired_text):
    """Return the first four-digit run in *aired_text* as an int, else None."""
    found = re.search("[0-9]{4}", aired_text)
    return int(found.group()) if found else None


# One entry per anime: the first four-digit number found in "Aired".
year = [_first_year(text) for text in anime.Aired]


# Assemble the table: Year plus the Name, Episodes and Type columns copied
# from the anime frame (aligned on the row index).
year = (
    pd.DataFrame(year)
    .rename(columns={0: "Year"})
    .assign(Name=anime.Name, Episodes=anime.Episodes, Type=anime.Type)
)


# Keep only rows with a known episode count that are not movies.
# A boolean mask keeps every surviving row exactly once. Building a list of
# positions with 0 as a placeholder for "Unknown" rows and then removing a
# single 0 would drop the genuine first row and repeat row 0 once for every
# remaining unknown row.
has_known_episodes = year.Episodes != "Unknown"
is_not_movie = year.Type != "Movie"
year = year[has_known_episodes & is_not_movie]


# Preview the filtered year table.
year.head()


# Write the year table to disk.
# NOTE: DataFrame.to_csv also writes the row index as a leading unnamed
# column by default; reading the file back yields an "Unnamed: 0" column.
year.to_csv("year.csv")


# Write the pruned anime table to disk (same index behaviour as above).
anime.to_csv("anime.csv")


# Keep only the first 84,389 reviews.
# NOTE(review): the reason for this exact cut-off is not recorded in the
# notebook - confirm before reusing it on a different export.
reviews = reviews.iloc[:84389]


# Remove the "link" column. `columns=` is used because the positional axis
# argument of DataFrame.drop is deprecated since pandas 1.3 and raises
# TypeError on pandas 2.0+.
reviews = reviews.drop(columns="link")


# Lower-case the review text before the lexicon lookup further down.
reviews["text"] = reviews["text"].str.lower()


# Each cell of reviews.scores is text such as
# "{'Overall': '8', 'Story': '8', 'Animation': '8', ...}". The integers are
# taken in order of appearance: overall, story, animation, sound, character,
# enjoyment. Every cell is parsed once (instead of once per sub-score) and
# the pattern is compiled a single time.
# A cell with fewer than six numbers raises IndexError.
score_pattern = re.compile(r"[0-9]+")
parsed_scores = [
    [int(token) for token in score_pattern.findall(score)]
    for score in reviews.scores
]
overall, story, animation, sound, character, enjoyment = (
    [row[position] for row in parsed_scores] for position in range(6)
)


reviews["overall"] = overall
reviews["story"] = story
reviews["animation"] = animation
reviews["sound"] = sound
reviews["character"] = character
reviews["enjoyment"] = enjoyment


# Build one NRCLex object per review text, in row order, and time the pass.
# reviews still carries its default 0..n-1 index at this point, so walking
# the column values visits the same texts as looking them up by label.
start = time.time()
senti_list = [NRCLex(review_text) for review_text in reviews.text]
end = time.time()
print("The encoding process took ", end-start, " s.")

The encoding process took  655.4879891872406  s.


# One affect-frequency dict per review.
senti_detail = [analysis.affect_frequencies for analysis in senti_list]

fear = [detail["fear"] for detail in senti_detail]
anger = [detail["anger"] for detail in senti_detail]
# NOTE(review): NRCLex appears to report the anticipation score under the
# key "anticipation", while "anticip" is only a placeholder that stays 0.0.
# Reading "anticip" alone would then give an all-zero column. Prefer
# "anticipation" and fall back to "anticip" - confirm against the installed
# NRCLex version. The column keeps its existing name "anticip".
anticip = [
    detail.get("anticipation", detail.get("anticip", 0.0))
    for detail in senti_detail
]
trust = [detail["trust"] for detail in senti_detail]
surprise = [detail["surprise"] for detail in senti_detail]
positive = [detail["positive"] for detail in senti_detail]
negative = [detail["negative"] for detail in senti_detail]
sadness = [detail["sadness"] for detail in senti_detail]
disgust = [detail["disgust"] for detail in senti_detail]
joy = [detail["joy"] for detail in senti_detail]

# Attach the scores to the reviews frame; dict order fixes the column order.
emotion_columns = {
    "fear": fear,
    "anger": anger,
    "anticip": anticip,
    "trust": trust,
    "surprise": surprise,
    "positive": positive,
    "negative": negative,
    "sadness": sadness,
    "disgust": disgust,
    "joy": joy,
}
for column_name, values in emotion_columns.items():
    reviews[column_name] = values


reviews.head()


# The raw "scores" text is no longer needed once the sub-scores have their
# own columns. `columns=` is used because the positional axis argument of
# DataFrame.drop is deprecated since pandas 1.3 and raises TypeError on
# pandas 2.0+.
reviews = reviews.drop(columns="scores")


# Persist the processed reviews.
reviews.to_csv("reviews.csv")

	profile	gender	birthday	favorites_anime	link
0	DesolatePsyche	Male	Oct 2, 1994	['33352', '25013', '5530', '33674', '1482', '2...	https://myanimelist.net/profile/DesolatePsyche
1	baekbeans	Female	Nov 10, 2000	['11061', '31964', '853', '20583', '918', '925...	https://myanimelist.net/profile/baekbeans
2	skrn	NaN	NaN	['918', '2904', '11741', '17074', '23273', '32...	https://myanimelist.net/profile/skrn
3	edgewalker00	Male	Sep 5	['5680', '849', '2904', '3588', '37349']	https://myanimelist.net/profile/edgewalker00
4	aManOfCulture99	Male	Oct 30, 1999	['4181', '7791', '9617', '5680', '2167', '4382...	https://myanimelist.net/profile/aManOfCulture99

	anime_id	rating	watching_status	watched_episodes
0	67	9.0	1.0	1
1	6702	7.0	1.0	4
2	242	10.0	1.0	4
3	4898	NaN	1.0	1
4	21	10.0	1.0	0

	Unnamed: 0	MAL_ID	Name	Score	Genres	English name	Japanese name	Type	Episodes	Aired	...	Rating	Ranked	Popularity	Members	Favorites	Watching	Completed	On-Hold	Dropped	Plan to Watch
0	0	1	Cowboy Bebop	8.78	Action, Adventure, Comedy, Drama, Sci-Fi, Space	Cowboy Bebop	カウボーイビバップ	TV	26	Apr 3, 1998 to Apr 24, 1999	...	R - 17+ (violence & profanity)	28.0	39	1251960	61971	105808	718161	71513	26678	329800
1	1	5	Cowboy Bebop: Tengoku no Tobira	8.39	Action, Drama, Mystery, Sci-Fi, Space	Cowboy Bebop:The Movie	カウボーイビバップ天国の扉	Movie	1	Sep 1, 2001	...	R - 17+ (violence & profanity)	159.0	518	273145	1174	4143	208333	1935	770	57964
2	2	6	Trigun	8.24	Action, Sci-Fi, Adventure, Comedy, Drama, Shounen	Trigun	トライガン	TV	26	Apr 1, 1998 to Sep 30, 1998	...	PG-13 - Teens 13 or older	266.0	201	558913	12944	29113	343492	25465	13925	146918
3	3	7	Witch Hunter Robin	7.27	Action, Mystery, Police, Supernatural, Drama, ...	Witch Hunter Robin	Witch Hunter ROBIN (ウイッチハンターロビン)	TV	26	Jul 2, 2002 to Dec 24, 2002	...	PG-13 - Teens 13 or older	2481.0	1467	94683	587	4300	46165	5121	5378	33719
4	4	8	Bouken Ou Beet	6.98	Adventure, Fantasy, Shounen, Supernatural	Beet the Vandel Buster	冒険王ビィト	TV	52	Sep 30, 2004 to Sep 29, 2005	...	PG - Children	3710.0	4369	13224	18	642	7314	766	1108	3394

	Year	Name	Episodes	Type
2	1998.0	Trigun	26	TV
3	2002.0	Witch Hunter Robin	26	TV
4	2004.0	Bouken Ou Beet	52	TV
5	2005.0	Eyeshield 21	145	TV
6	2005.0	Hachimitsu to Clover	24	TV

	uid	profile	anime_uid	text	score	scores	fear	anger	trust	...	negative	sadness	disgust	joy	overall	story	animation	sound	character	enjoyment
0	255938	DesolatePsyche	34096	\n \n \n \n ...	8	{'Overall': '8', 'Story': '8', 'Animation': '8...	0.047619	0.054422	0.129252	...	0.136054	0.081633	0.054422	0.122449	8	8	8	10	9	8
1	259117	baekbeans	34599	\n \n \n \n ...	10	{'Overall': '10', 'Story': '10', 'Animation': ...	0.088235	0.029412	0.134454	...	0.105042	0.058824	0.021008	0.117647	10	10	10	10	10	10
2	253664	skrn	28891	\n \n \n \n ...	7	{'Overall': '7', 'Story': '7', 'Animation': '9...	0.028169	0.014085	0.225352	...	0.014085	0.042254	0.000000	0.183099	7	7	9	8	8	8
3	8254	edgewalker00	2904	\n \n \n \n ...	9	{'Overall': '9', 'Story': '9', 'Animation': '9...	0.056769	0.052402	0.122271	...	0.135371	0.074236	0.078603	0.096070	9	9	9	10	10	9
4	291149	aManOfCulture99	4181	\n \n \n \n ...	10	{'Overall': '10', 'Story': '10', 'Animation': ...	0.072727	0.036364	0.181818	...	0.054545	0.054545	0.018182	0.181818	10	10	8	9	10	10

Process the profile data

Handle missing values in the animelist.csv file (the rating and watching_status columns)

For the dataset that stores information about the various anime, we also want to delete unimportant columns

Finally, we need to process the reviews.csv data