diff --git a/.gitattributes b/.gitattributes index c81953b78c0f443e9fbd7f285d390e07c7f6a7e8..bbe2e015f7181a03bbbb6d6cb856dec2f7ee54d0 100644 --- a/.gitattributes +++ b/.gitattributes @@ -7,5 +7,6 @@ notebooks/FCA.ipynb !filter notebooks/Hamming.ipynb !filter notebooks/Weinbewertungen_Vivino.ipynb !filter notebooks/distances.ipynb !filter +notebooks/Video_Games.ipynb notebooks/wikipedia_language_editions.ipynb !filter notebooks/wikipedia_regex.ipynb !filter diff --git a/notebooks/Video_Games.ipynb b/notebooks/Video_Games.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..cbf505d550e6299003020d703db78029a1aa68ea --- /dev/null +++ b/notebooks/Video_Games.ipynb @@ -0,0 +1,2412 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "9afc5b3a", + "metadata": {}, + "source": [ + "<center>Institut für Bibliotheks- und Informationswissenschaft, Humboldt-Universität zu Berlin</center>\n", + "\n", + "<h1 align=\"center\">Modul Datenanalyse & -auswertung: Analysis of Video Games Sales Data</h1>\n", + "<h2 align=\"center\">Jan Raoul Weber</h2>" + ] + }, + { + "cell_type": "markdown", + "id": "3526c728", + "metadata": {}, + "source": [ + "<h2>Table of Contents</h2>" + ] + }, + { + "cell_type": "markdown", + "id": "406c3ffc", + "metadata": {}, + "source": [ + "1. Introduction\n", + "2. The dataset and its components\n", + "3. Data cleansing, NaN-values & descriptive analysis of the dataset\n", + "4. Relationships between variables\n", + "5. Answering the RQS / inductive analysis\n", + "6. 
Discussion\n", + "\n", + "References<br>\n", + "Appendix" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "07886533", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "import thinkstats2 as ts2\n", + "import thinkplot as tp\n", + "import re\n", + "import scipy.stats as stats\n", + "import random\n", + "import statsmodels.formula.api as smf" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "58b603d2", + "metadata": {}, + "outputs": [], + "source": [ + "# Originally an extra file called \"thinkstats_extra\"\n", + "class ParetoPdf(ts2.Pdf):\n", + " \"\"\"Represents the PDF of a Pareto distribution.\"\"\"\n", + "\n", + " def __init__(self, xm, alpha, high, label=None):\n", + " \"\"\"Constructs a Pareto Pdf with given parameter.\n", + "\n", + " xm: minimum value (scale parameter)\n", + " alpha: shape parameter\n", + " high: upper bound value\n", + " label: string\n", + " \"\"\"\n", + " self.xm = xm\n", + " self.alpha = alpha\n", + " self.high = high\n", + " self.label = label if label is not None else '_nolegend_'\n", + "\n", + " def __str__(self):\n", + " return 'ParetoPdf(%f, %f)' % (self.xm, self.alpha)\n", + "\n", + " def GetLinspace(self):\n", + " \"\"\"Get a linspace for plotting.\n", + "\n", + " Returns: numpy array\n", + " \"\"\"\n", + " return np.linspace(self.xm, self.high, 101)\n", + "\n", + " def Density(self, xs):\n", + " \"\"\"Evaluates this Pdf at xs.\n", + "\n", + " xs: scalar or sequence of floats\n", + "\n", + " returns: float or NumPy array of probability density\n", + " \"\"\"\n", + " return stats.pareto.pdf(xs, self.alpha, self.xm)\n", + "\n", + "def MakeNormalPlot1(weights):\n", + " \n", + " mean, var = ts2.TrimmedMeanVar(weights, p=0.01)\n", + " std = np.sqrt(var)\n", + "\n", + " xs = [-5, 5]\n", + " xs, ys = ts2.FitLine(xs, mean, std)\n", + " tp.Plot(xs, ys, color=\"0.8\", 
label=\"normal model\")\n", + "\n", + " xs, ys = ts2.NormalProbability(weights)\n", + " tp.Plot(xs, ys, label=\"NA Sales\")\n", + " \n", + "def MakeNormalPlot2(weights):\n", + " \n", + " mean, var = ts2.TrimmedMeanVar(weights, p=0.01)\n", + " std = np.sqrt(var)\n", + "\n", + " xs = [-5, 5]\n", + " xs, ys = ts2.FitLine(xs, mean, std)\n", + " tp.Plot(xs, ys, color=\"0.8\", label=\"normal model\")\n", + "\n", + " xs, ys = ts2.NormalProbability(weights)\n", + " tp.Plot(xs, ys, label=\"NA Sales (log10)\")\n", + " \n", + "def ExpoResiduals(xs, ys, inter, slope):\n", + " xs = np.asarray(xs)\n", + " ys = np.asarray(ys)\n", + " res = ys - 10**(inter + slope * xs)\n", + " return res\n", + "\n", + "def ci(x,y):\n", + " r, p = stats.spearmanr(x, y)\n", + " r_z = np.arctanh(r)\n", + " alpha = 0.05\n", + " se = 1/np.sqrt(x.size-3)\n", + " z = stats.norm.ppf(1-alpha/2)\n", + " lo_z, hi_z = r_z-z*se, r_z+z*se\n", + " lo, hi = np.tanh((lo_z, hi_z))\n", + " return lo, hi" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7d20611a", + "metadata": {}, + "outputs": [], + "source": [ + "plt.rcParams[\"figure.figsize\"] = (16,9)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "914d44d2", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv('video_games_sales_2016.csv')" + ] + }, + { + "cell_type": "markdown", + "id": "b2c4a73f", + "metadata": {}, + "source": [ + "<h2>1. 
Introduction</h2>" + ] + }, + { + "cell_type": "markdown", + "id": "28f82cde", + "metadata": {}, + "source": [ + "Since the first ever commercial video game \"Computer Space\" came out more than fifty years ago ([Wardrip-Fruin\n", + "2021](#Wardrip-Fruin2021)), the video games industry has grown into a 155.89 billion US-Dollar market in the year 2020, expecting to grow to 268.81 in 2025 ([statista & Juniper Research 2021](#globalmarketvalue20202025)).\n", + "\n", + "In the wake of the Covid-19 pandemic and the consequential lockdown ocurring in numerous countries ([Dunford et al. 2020](#Dunford2020)), video games became even more popular, sending platform and video game sales surging ([GamesIndustry.biz, 2020](#GamesIndustry2020)). Consequently, the amount of time spent on video games globally within each generational group increased (Gen Z: 23\\%, Millenials: 21\\%, Gen X: 13\\%, Baby Boomer: 11\\%) ([Globalwebindex 2020, p.32](#GWI2020)). Accompanying it, the money spent on video games soared as well, increasing by 39\\% globally ([Simon-Kucher & Partners 2020](#SimonKucher2020)). Even though the size of growth is likely to decrease again, the market is still expected to increase substantially in the coming years ([pwc 2021, p.165](#pwc2021); [Simon-Kucher & Partners 2020](#SimonKucher2020); [Wijman 2020](#Wijman2020)).\n", + "\n", + "It was therefore deemed worthwhile to examine the sales of video games across different regions of the world to find out what video games are sold where and how the regional markets differ regarding preferences for e.g. Publishers." + ] + }, + { + "cell_type": "markdown", + "id": "c39fd075", + "metadata": {}, + "source": [ + "<h2 class = \"anchor\" id = \"c2\">2. 
The Dataset and its components</h2>" + ] + }, + { + "cell_type": "markdown", + "id": "c24c2beb", + "metadata": {}, + "source": [ + "The original dataset of video game sales data was created by [Gregory Smith](https://www.kaggle.com/datasets/gregorut/videogamesales), containing the scraped *Totals Software Charts* from [vgchartz.com](https://www.vgchartz.com/gamedb/). VGChartz is a business intelligence and research company that publishes video game charts and video game hardware estimates every week ([VGChartz 2022](#vgchartz2022)). However, VGChartz had to face criticism due to its methods being opaque and some numbers being only estimates without an actual solid underlying set of data ([Carless 2008](#Carless2008), [Kohler 2008](#Kohler2008)). Despite these shortcomings, it is still the most comprehensive publicly available source of video games sales data.\n", + "\n", + "The dataset was then enriched by [Rush Kirubi](https://www.kaggle.com/datasets/rush4ratio/video-game-sales-with-ratings), by adding information like rating scores from the [metacritic-website](https://www.metacritic.com/). Metacritic publishes Meta-scores for movies and video games based on scores given by media outlets, but also lets users score movies and video games themselves ([Metacritic 2020](#Metacritic2022a))." 
+ ] + }, + { + "cell_type": "markdown", + "id": "7be4ea26", + "metadata": {}, + "source": [ + "The original dataset \"video_game_sales_2016.csv\" contains 16 variables.\n", + "To work within the scope of this term paper, the number of variables contained in the dataset had to be reduced.\n", + "The focus of the analysis is on the sales in between different regions and the relationship between Publishers, Genre and Sales and therefore, the following variables were dropped:\n", + "\n", + "* Year_of_Release\n", + " - Time analysis is not the focus of this analysis and it is therefore dropped for brevity reasons.\n", + "* User_Counts / Critic_Counts\n", + " - To know how many people voted / contributed to the final scores was deemed not relevant for the analysis.\n", + "* Developer\n", + " - Developer of the video game is much less known and visible to possible consumers than publisher and was therefore dropped for brevity reasons.\n", + "* Platform\n", + " - Very inconsistent variable due to a new platform appearing every few years. Therefore it's explanatory power is constrained by time and is therefore not a variable that makes sense using for an analysis that spans from the mid 80s till today. Therefore the decision was made to go give the more \"consistent\" variables more space for analysis and drop the platform-variable.\n", + "* Rating\n", + " - Dropped for brevity reasons" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "81147375", + "metadata": {}, + "outputs": [], + "source": [ + "df.drop([\"User_Count\", \"Critic_Count\", \"Developer\", \"Rating\", \"Year_of_Release\", \"Platform\"], axis = 1, inplace = True)" + ] + }, + { + "cell_type": "markdown", + "id": "f73d0217", + "metadata": {}, + "source": [ + "Following, the variables are presented and background information is given, if necessary.\n", + "If the variables are of a similar kind, they are subsumed under one heading (e.g. 
all \"Sales\" variables).\n", + "\n", + "From this point on, written text will be displayed in blue boxes to make it easily differentiable from code and output." + ] + }, + { + "cell_type": "markdown", + "id": "baedc6f2", + "metadata": {}, + "source": [ + "<h3> 2.1 Name (nominal)</h3>\n", + "<div class = 'alert alert-info'>\n", + "The name the video game was released under.\n", + " </div>" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ccd4dc95", + "metadata": {}, + "outputs": [], + "source": [ + "print('The attribute \"Name\" has', df['Name'].nunique(), 'manifestations')" + ] + }, + { + "cell_type": "markdown", + "id": "dc0bc9a9", + "metadata": {}, + "source": [ + "<h3> 2.2 Genre (nominal)</h3>\n", + "<div class = 'alert alert-info'>\n", + "The genre, the video game belongs to. A genre describes the type or style of a video game that can be identified by certain inherent characteristics (<a href = \"#Oxford2022\">Oxford Learner's Dicitionaries 2022</a>). Classic examples would be Action, Adventure, Role-playing, Sports or Racing games (<a href=\"#mirillis2017\">Mirillis 2017)</a>.\n", + "</div>" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "65be5d4a", + "metadata": {}, + "outputs": [], + "source": [ + "print('The attribute \"Genre\" has', df['Genre'].nunique(), 'manifestations')" + ] + }, + { + "cell_type": "markdown", + "id": "6e280cbd", + "metadata": {}, + "source": [ + "<h3> 2.3 Publisher (nominal)</h3>\n", + "<div class = 'alert alert-info'>\n", + "The name of the publisher of each video game release. 
Publishers are usually big companies or parent organisations that finance the development of a video game (<a href = \"#Zegarra2020\">Zegarra 2020</a>).\n", + "</div>" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "77346467", + "metadata": {}, + "outputs": [], + "source": [ + "print('The attribute \"Publisher\" has', df['Publisher'].nunique(), 'manifestations')" + ] + }, + { + "cell_type": "markdown", + "id": "36fe1eb0", + "metadata": {}, + "source": [ + "<h3> 2.4 Sales (all numerical)</h3>\n", + "<div class = 'alert alert-info'>\n", + "All sales are in millions of units.\n", + " <ul>\n", + " <li>Global = Global Sales</li>\n", + " <li>NA (North America) = Canada, Mexico, USA</li>\n", + " <li>EU = European Union</li>\n", + " <li>JP = Japan</li>\n", + " <li>Other = Australia, Asia (excluding Japan), South America, Europe (excluding the EU) and Africa</li>\n", + " </ul>\n", + "</div>" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "536ec214", + "metadata": {}, + "outputs": [], + "source": [ + "print('\"Global_Sales\" has', df['Global_Sales'].nunique(), 'manifestations')\n", + "print('\"NA_Sales\" has', df['NA_Sales'].nunique(), 'manifestations')\n", + "print('\"EU_Sales\" has', df['EU_Sales'].nunique(), 'manifestations')\n", + "print('\"JP_Sales\" has', df['JP_Sales'].nunique(), 'manifestations')\n", + "print('\"Other_Sales\" has', df['Other_Sales'].nunique(), 'manifestations')" + ] + }, + { + "cell_type": "markdown", + "id": "7559b437", + "metadata": {}, + "source": [ + "<h3> 2.5 Scores</h3>" + ] + }, + { + "cell_type": "markdown", + "id": "e364b019", + "metadata": {}, + "source": [ + "<h4>2.5.1 Critic_Score (numerical)</h4>\n", + "<div class = 'alert alert-info'>\n", + "The score (out of 100) given to games by the metacritic's <a href = \"https://www.metacritic.com/about-metascores\">Metascore System</a>.\n", + "</div>" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e4445f8f", + "metadata": 
{}, + "outputs": [], + "source": [ + "print('The attribute \"Critic_Score\" has', df['Critic_Score'].nunique(), 'manifestations')" + ] + }, + { + "cell_type": "markdown", + "id": "90943ad5", + "metadata": {}, + "source": [ + "<h4>2.5.2 User_Score (numerical)</h4>\n", + "<div class = 'alert alert-info'>\n", + "Cumulated score of ratings given to video games by metacritic's users (out of 10).\n", + "</div>" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "92285def", + "metadata": {}, + "outputs": [], + "source": [ + "print('The attribute \"User_Score\" has', df['User_Score'].nunique(), 'manifestations')" + ] + }, + { + "cell_type": "markdown", + "id": "b0735c74", + "metadata": {}, + "source": [ + "<h2>3. Data cleansing, NaN-values & Descriptive Analysis of the Dataset</h2>" + ] + }, + { + "cell_type": "markdown", + "id": "d10c1c55", + "metadata": {}, + "source": [ + "<h3>3.1 Data cleansing</h3>" + ] + }, + { + "cell_type": "markdown", + "id": "30d1eb29", + "metadata": {}, + "source": [ + "<div class = 'alert alert-info'>\n", + "Before starting the descriptive analysis of each variable, first their data-type is checked for any abnormalities:\n", + "</div> " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "59274dc8", + "metadata": {}, + "outputs": [], + "source": [ + "df.dtypes" + ] + }, + { + "cell_type": "markdown", + "id": "c2e00790", + "metadata": {}, + "source": [ + "<div class = 'alert alert-info'>\n", + " <i>Name, Genre,</i> and <i> Publisher</i> are all correctly stored as <i>string</i> (in pandas <a href = \"https://pandas.pydata.org/pandas-docs/stable/user_guide/text.html\"><i>object</i></a>, respectively).\n", + " <i>NA_/EU_/JP_/Other_Sales and Critic_Score</i> are all correctly stored as floating-point-numbers.<br>\n", + " <br>\n", + "The only variable that shows abnormalities, is <i>User_Score</i> as an object.\n", + "</div> " + ] + }, + { + "cell_type": "markdown", + "id": "613258f7", + "metadata": {}, + 
"source": [ + "<h4>3.1.1 User_Score</h4>" + ] + }, + { + "cell_type": "markdown", + "id": "74e68072", + "metadata": {}, + "source": [ + "<div class = 'alert alert-info'>\n", + " The string-value <i>tbd</i> (to be determined) contained in <i>User_Score</i> is responsible for turning the variable into an object-datatype when it should actually be a float-datatype to be practical for analysis.\n", + "About its frequency in the context of the variable as such, it can be said that:\n", + "</div>" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d1d9a180", + "metadata": {}, + "outputs": [], + "source": [ + "num_tbd = len(df.loc[df['User_Score'] == 'tbd'])\n", + "print('The total number of tbd values is:', num_tbd, 'out of', len(df['User_Score']))\n", + "print('The ratio of tbd-values is therefore in \"user_score\" is:', round(((num_tbd / len(df['User_Score'])) * 100), 2 ), \n", + " '%')" + ] + }, + { + "cell_type": "markdown", + "id": "ffb5aecd", + "metadata": {}, + "source": [ + "<div class = 'alert alert-info'>\n", + "14.5% is a considerable amount. However, due to them making the variable not analyseable, a separate variable is created. The possibility of giving 'tbd' a proxy-value like 20 was discarded, because it would distort the analysis (e.g. 
the mean).\n", + "</div>" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "01da5bfc", + "metadata": {}, + "outputs": [], + "source": [ + "user_score_cleansed = df['User_Score'].replace('tbd', np.nan)\n", + "user_score_cleansed = pd.to_numeric(user_score_cleansed)\n", + "user_score_cleansed" + ] + }, + { + "cell_type": "markdown", + "id": "2012e7db", + "metadata": {}, + "source": [ + "<h3>3.2 Analysis of Nan-values</h3>" + ] + }, + { + "cell_type": "markdown", + "id": "7bd92c58", + "metadata": {}, + "source": [ + "<div class = 'alert alert-info'>\n", + "Even though NaN-values are always dropped in further analysis, the ratio of NaN-values is checked, because it is an important aspect of the later interpretation of the results:\n", + "</div>" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d78a1d5e", + "metadata": {}, + "outputs": [], + "source": [ + "for x, y in df.iteritems():\n", + " null = df[x].isnull()\n", + " num_null = len(null.loc[null == True])\n", + " print('NaN-values ratio for', '\"' + str(x) + '\":', round(((num_null / len(null)) * 100), 2 ), '%')" + ] + }, + { + "cell_type": "markdown", + "id": "79334945", + "metadata": {}, + "source": [ + "<div class = 'alert alert-info'>\n", + " <i>Critic_Score</i> and <i>User_Score</i> have a high amount of NaN-values, not including the \"tbd\" - values in User_Score. This is due to some of the games not having a matching entry on <a href = \"https://www.metacritic.com/\">metacritic.com</a>, as explained by <a href = \"https://www.kaggle.com/datasets/rush4ratio/video-game-sales-with-ratings\">Rush Kirubi</a>. This is unfortunate and will make the results less meaningful, but there is no way of getting score results for them any other way and this diminishment of applicational power of the results has to be accepted." 
+ ] + }, + { + "cell_type": "markdown", + "id": "dfc725a8", + "metadata": {}, + "source": [ + "<h3>3.3 Descriptive Analysis</h3>" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8c739746", + "metadata": {}, + "outputs": [], + "source": [ + "name = df['Name'].dropna()\n", + "genre = df['Genre'].dropna()\n", + "publisher = df['Publisher'].dropna()\n", + "na_s = df['NA_Sales'].dropna()\n", + "eu_s = df['EU_Sales'].dropna()\n", + "jp_s = df['JP_Sales'].dropna()\n", + "other_s = df['Other_Sales'].dropna()\n", + "global_s = df['Global_Sales'].dropna()\n", + "critic_score = df['Critic_Score'].dropna()\n", + "user_score = user_score_cleansed.dropna() # The variable already created in 4.1.2 is used" + ] + }, + { + "cell_type": "markdown", + "id": "0185b8bb", + "metadata": {}, + "source": [ + "<h4>3.3.1 Name</h4>" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d813cda4", + "metadata": {}, + "outputs": [], + "source": [ + "print('A short overview of summary statistics:')\n", + "print('')\n", + "\n", + "print(name.describe())\n", + "print('')\n", + "\n", + "print('There are', name.describe()[1], 'different games in the dataset.')\n", + "print('The most represented game is:', \n", + " '\"' + name.value_counts().index.tolist()[0] + '\"', \n", + " 'with', name.value_counts()[0], 'appearances.')\n", + "print('The least represented game is:', \n", + " '\"' + name.value_counts().index.tolist()[11558] + '\"' ,\n", + " 'with', name.value_counts()[11558], 'appearance.')\n", + "print('There are', len(df) - df['Name'].nunique(), \n", + " 'more records in the dataset than there are unique names.')\n", + "names_v_c = name.value_counts()\n", + "names_moo = names_v_c.loc[names_v_c > 1]\n", + "print('Overall, there are', len(names_moo), \n", + " 'video games that have been published on more than one platform.')\n", + "print('Of the',len(names_moo), 'video games, the most were published on', \n", + " names_moo.mode()[0], \n", + " 
'platforms',\n", + " '(' + str(len(names_v_c.loc[names_v_c == 2])) + ').')" + ] + }, + { + "cell_type": "markdown", + "id": "232f1034", + "metadata": {}, + "source": [ + "<div class = 'alert alert-info'>\n", + "Some video games appear more than once in the dataset due to multiple platform releases, leading to more records than unique names.\n", + "</div> " + ] + }, + { + "cell_type": "markdown", + "id": "35dce7e1", + "metadata": {}, + "source": [ + "<h4>3.3.2 Genre</h4>" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b94b1dd6", + "metadata": {}, + "outputs": [], + "source": [ + "print('A short overview of summary statistics:')\n", + "print('')\n", + "\n", + "print(genre.describe())\n", + "print('')\n", + "\n", + "print('There are', genre.describe()[1], 'different genres in the dataset.')\n", + "print('The most represented genre is:', \n", + " '\"' + genre.value_counts().index.tolist()[0] + '\"', \n", + " 'with', genre.value_counts()[0], 'appearances.')\n", + "print('The least represented genre is:', \n", + " '\"' + genre.value_counts().index.tolist()[11] + '\"' ,\n", + " 'with', genre.value_counts()[11], 'appearances.')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dec3a6df", + "metadata": {}, + "outputs": [], + "source": [ + "genre_hist = ts2.Hist(genre)\n", + "hd = genre_hist.GetDict()\n", + "plt.bar(hd.keys(), hd.values(), color = '#9400d3')\n", + "plt.title('Genre frequency', fontsize = 18)\n", + "plt.xlabel(\"Genre Name\", fontsize = 14)\n", + "plt.ylabel('Number of games released', fontsize = 14)\n", + "for x,y in zip(hd.keys(),hd.values()):\n", + " label = \"{:.2f}\".format(y)\n", + " plt.annotate(label,\n", + " (x,y), \n", + " textcoords=\"offset points\", \n", + " xytext=(0,10), \n", + " ha='center')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "79da8fbc", + "metadata": {}, + "source": [ + "<h4>3.3.3 Publisher</h4>" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"id": "126dae8d", + "metadata": {}, + "outputs": [], + "source": [ + "print('A short overview of summary statistics:')\n", + "print('')\n", + "\n", + "print(publisher.describe())\n", + "print('')\n", + "\n", + "print('The most frequent publisher is:', \n", + " '\"' + str(publisher.value_counts().index.tolist()[0]) + '\"', \n", + " 'with', publisher.value_counts()[0], 'appearances.')\n", + "print('The least frequent publisher is:', \n", + " '\"' + str(publisher.value_counts().index.tolist()[580]) + '\"',\n", + " 'with', publisher.value_counts()[580], 'appearances.')" + ] + }, + { + "cell_type": "markdown", + "id": "c171569a", + "metadata": {}, + "source": [ + "<div class = 'alert alert-info'>\n", + "Due to the large number of different publishers (581), only the top 20 are visualised:\n", + "</div> " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1b1c6dd6", + "metadata": {}, + "outputs": [], + "source": [ + "top20_publishers_name = publisher.value_counts().index.tolist()[:20]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aeac6204", + "metadata": {}, + "outputs": [], + "source": [ + "top20_publishers = publisher.loc[publisher.isin(top20_publishers_name) ]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "24809c29", + "metadata": {}, + "outputs": [], + "source": [ + "top20_publishers_hist = ts2.Hist(top20_publishers, label = 'Publisher')\n", + "hd = top20_publishers_hist.GetDict()\n", + "plt.bar(hd.keys(), hd.values(), color = '#9400d3')\n", + "plt.title('Publisher frequency', fontsize = 18)\n", + "plt.xlabel(\"Publisher Name\", fontsize = 14)\n", + "plt.ylabel('Number of games released', fontsize = 14)\n", + "for x,y in zip(hd.keys(),hd.values()):\n", + " label = \"{:.2f}\".format(y)\n", + " plt.annotate(label,\n", + " (x,y), \n", + " textcoords=\"offset points\", \n", + " xytext=(0,10), \n", + " ha='center')\n", + "plt.xticks(rotation = 90)\n", + "plt.show()" + ] + }, + { + "cell_type": 
"markdown", + "id": "5aadce8f", + "metadata": {}, + "source": [ + "<h4>3.3.4 Sales </h4>" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "56e5dd2c", + "metadata": {}, + "outputs": [], + "source": [ + "print('Global mean:', round(global_s.mean(),3))\n", + "print('Highest sale number:', global_s.max(), 'million')\n", + "print('Lowest sale number:', global_s.min(), 'million')\n", + "print('Standard deviation:', global_s.std())\n", + "print('')\n", + "\n", + "print('North America mean:', round(na_s.mean(),3))\n", + "print('Highest sale number:', na_s.max(), 'million')\n", + "print('Lowest sale numbber:', na_s.min(), 'million')\n", + "print('Standard deviation:', na_s.std())\n", + "print('')\n", + "\n", + "print('European Union mean:', round(eu_s.mean(),3))\n", + "print('Highest sale number:', eu_s.max(), 'million')\n", + "print('Lowest sale numbber:', eu_s.min(), 'million')\n", + "print('Standard deviation:', eu_s.std())\n", + "print('')\n", + "\n", + "print('Japan mean:', round(jp_s.mean(),3))\n", + "print('Highest sale number:', jp_s.max(), 'million')\n", + "print('Lowest sale number:', jp_s.min(), 'million')\n", + "print('Standard deviation:', jp_s.std())\n", + "print('')\n", + "\n", + "print('Other mean:', round(other_s.mean(),3))\n", + "print('Highes sale number:', other_s.max(), 'million')\n", + "print('Lowest sale numbber:', other_s.min(), 'million')\n", + "print('Standard deviation:', other_s.std())" + ] + }, + { + "cell_type": "markdown", + "id": "09936213", + "metadata": {}, + "source": [ + "<div class = 'alert alert-info'>\n", + "The first thing to notice is that there are games that have recorded sale numbers that appear to be 0, meaning, the sales number was so small that it could not be represented in the statistic.\n", + "<br>\n", + " <br>\n", + "To get an overall feel of the distributions of the different sales - variables, histograms, normal probability plots and a comparative CDF are created:\n", + "</div>" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "id": "5af7c92f", + "metadata": {}, + "outputs": [], + "source": [ + "global_s_hist = ts2.Hist(global_s)\n", + "na_s_hist = ts2.Hist(na_s)\n", + "eu_s_hist = ts2.Hist(eu_s)\n", + "jp_s_hist = ts2.Hist(jp_s)\n", + "other_s_hist = ts2.Hist(other_s)\n", + "\n", + "#Global\n", + "plt.subplot(5, 2, 1) \n", + "plt.hist(global_s_hist, bins = 500, width=0.1, color = 'black', label = 'Global')\n", + "plt.title('Global sales')\n", + "plt.ylabel('Frequency')\n", + "\n", + "plt.subplot(5, 2, 2) \n", + "MakeNormalPlot1(global_s)\n", + "tp.Config(ylabel = 'Sales (in million)', title = 'Global Sales (Normal Probability Plot)')\n", + "\n", + "# NA\n", + "plt.subplot(5, 2, 3) \n", + "plt.hist(na_s_hist, bins=100, width=0.2, color = 'violet', label = 'North America Sales')\n", + "plt.title('North America sales')\n", + "plt.ylabel('Frequency')\n", + "\n", + "plt.subplot(5, 2, 4) \n", + "MakeNormalPlot1(na_s)\n", + "tp.Config(ylabel = 'Sales (in million)', title = 'North America Sales (Normal Probability Plot)')\n", + "\n", + "#EU\n", + "plt.subplot(5, 2, 5) \n", + "plt.hist(eu_s_hist, bins=100, width=0.2, color = 'blue', label = 'EU Sales')\n", + "plt.title('EU sales')\n", + "plt.ylabel('Frequency')\n", + "\n", + "plt.subplot(5, 2, 6) \n", + "MakeNormalPlot1(eu_s)\n", + "tp.Config(ylabel = 'Sales (in million)', title = 'EU sales')\n", + "\n", + "\n", + "\n", + "#Japan\n", + "plt.subplot(5, 2, 7)\n", + "plt.hist(jp_s_hist, bins = 50, width=0.1, color = 'red', label = 'Japan sales')\n", + "plt.title('Japan sales')\n", + "plt.ylabel('Frequency')\n", + "\n", + "plt.subplot(5, 2, 8) \n", + "MakeNormalPlot1(jp_s)\n", + "tp.Config(ylabel = 'Sales (in million)', title = 'Japan sales')\n", + "\n", + "#other\n", + "plt.subplot(5, 2, 9) \n", + "plt.hist(other_s_hist, bins = 80, width=0.1, color = 'yellow', label = 'Other sales')\n", + "plt.title('Other sales')\n", + "plt.xlabel('Sales (in million)')\n", + "plt.ylabel('Frequency')\n", + "\n", + 
"plt.subplot(5, 2, 10) \n", + "MakeNormalPlot1(other_s)\n", + "tp.Config(xlabel = 'z', ylabel = 'Sales (in million)', title = 'Other sales')\n", + "\n", + "plt.subplots_adjust(left=0.1, bottom=0.1, right=0.9, top=1, wspace=0.4, hspace=0.8)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b0573d9c", + "metadata": {}, + "outputs": [], + "source": [ + "# variables\n", + "na_s_cdf = ts2.Cdf(na_s)\n", + "eu_s_cdf = ts2.Cdf(eu_s)\n", + "jp_s_cdf = ts2.Cdf(jp_s)\n", + "other_s_cdf = ts2.Cdf(other_s)\n", + "global_s_cdf = ts2.Cdf(global_s)\n", + "\n", + "# pareto model (Global Sales is taken as a proxy-variable to create the pareto model)\n", + "pdf_pareto = ParetoPdf(xm = 0, alpha = 5, high = 100, label = 'pareto model (Global Sales)')\n", + "cdf_pareto = ts2.Cdf(pdf_pareto)\n", + "\n", + "# exponential model\n", + "lam = 2\n", + "pdf_expo = ts2.ExponentialPdf(lam, label='exponential model')\n", + "cdf_expo = ts2.Cdf(pdf_expo)\n", + "\n", + "\n", + "tp.Cdf(na_s_cdf, label = 'North America Sales', color = 'violet')\n", + "tp.Cdf(eu_s_cdf, label = 'European Union Sales', color = 'blue')\n", + "tp.Cdf(jp_s_cdf, label = 'Japan Sales', color = 'red')\n", + "tp.Cdf(other_s_cdf, label = 'Other Sales', color = 'yellow')\n", + "tp.Cdf(global_s_cdf, label = 'Global Sales', color = 'black')\n", + "tp.Cdf(cdf_pareto, label ='pareto model')\n", + "tp.Cdf(cdf_expo, label = 'exponential model')\n", + "plt.title('CDF of all sales variables compared to pareto and exponential', fontsize = 18)\n", + "tp.Config(xlabel = 'Sales (in million)', ylabel = 'CDF')" + ] + }, + { + "cell_type": "markdown", + "id": "e801ba7a", + "metadata": {}, + "source": [ + "<div class = 'alert alert-info'>\n", + "The histograms, the normal probability plots and the comparative cdf show that the sales - variables are not normally distributed, but rather have the shape of an exponential or even pareto distribution.<br>\n", + "An educated guess would be that they are 
lognormally distributed, which will be investigated further by using <i>Global Sales</i> as a proxy-variable.\n", + "</div> " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "33236fef", + "metadata": {}, + "outputs": [], + "source": [ + "log_global_s = np.log(global_s) " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6b5d915e", + "metadata": {}, + "outputs": [], + "source": [ + "log_global_s_cdf = ts2.Cdf(log_global_s)\n", + "\n", + "log_norm = np.random.normal(loc = log_global_s.mean(), scale = log_global_s.std(), \n", + " size = len(log_global_s))\n", + "log_cdf_norm = ts2.Cdf(log_norm, label = 'sample')\n", + "\n", + "\n", + "tp.Cdf(log_global_s_cdf, label = 'Global sales (ln)', color = 'black')\n", + "tp.Cdf(log_cdf_norm, label = 'normal model')\n", + "plt.title('CDF Global Sales (ln) vs. normal distribution model', fontsize = 18)\n", + "tp.Config(xlabel = 'Sales in million (ln)', ylabel = 'CDF')" + ] + }, + { + "cell_type": "markdown", + "id": "880a9256", + "metadata": {}, + "source": [ + "<div class = 'alert alert-info'>\n", + "The variable seems to be roughly lognormally distributed.\n", + "To confirm this we also take a look at their PDFs using the Kernel Density Estimation (KDE):\n", + " </div>" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "41002d88", + "metadata": {}, + "outputs": [], + "source": [ + "mean = log_global_s.mean()\n", + "std = log_global_s.std()\n", + "\n", + "normal_pdf = ts2.NormalPdf(mean, std)\n", + "\n", + "log_estimated_pdf = ts2.EstimatedPdf(log_global_s)\n", + "tp.Pdf(normal_pdf, label = 'normal model')\n", + "tp.Pdf(log_estimated_pdf, label = 'Global Sales (ln)', color = 'black')\n", + "tp.Config(xlabel = 'Sales (in million (ln))', ylabel = 'PDF')\n", + "\n", + "print('The Fisher-Pearson coefficient of skewness:', stats.skew(log_global_s))\n", + "print('Its p-value:', stats.skewtest(log_global_s)[1])" + ] + }, + { + "cell_type": "markdown", + "id": "4d07b5b5", 
+ "metadata": {}, + "source": [ + "<div class = 'alert alert-info'>\n", + "Concluding, we can say that the Global Sales - variable, even though it is statistically significantly skewed to the right, is roughly lognormally distributed.<br>\n", + "Since all of the variables have a rather similarly shaped CDF, we can, for brevity's sake, conclude that the other variables are also lognormally distributed.\n", + "</div>" + ] + }, + { + "cell_type": "markdown", + "id": "e83ee52e", + "metadata": {}, + "source": [ + "<h4>3.3.5 Scores (User/Critic)</h4>" + ] + }, + { + "cell_type": "markdown", + "id": "ee6670bd", + "metadata": {}, + "source": [ + "<div class = 'alert alert-info'>\n", + " To standardise the two scores, the values of the <i>Critic_Score</i> - variable are divided by 10 to turn it into an \"out of 10\" score.\n", + "</div>" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "10c072eb", + "metadata": {}, + "outputs": [], + "source": [ + "critic_score = critic_score / 10" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2b52d99d", + "metadata": {}, + "outputs": [], + "source": [ + "print('A short overview of summary statistics:')\n", + "print('')\n", + "\n", + "print('Mean critic score:', round(critic_score.mean(),2))\n", + "print('Highest critic score:', critic_score.max())\n", + "print('Lowest critic score:', critic_score.min())\n", + "print('Standard deviation:', critic_score.std())\n", + "print('')\n", + "\n", + "print('Mean user score:', round(user_score.mean(),2))\n", + "print('Highest user score:', user_score.max())\n", + "print('Lowest user score:', user_score.min())\n", + "print('Standard deviation:', user_score.std())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3c3570d4", + "metadata": {}, + "outputs": [], + "source": [ + "critic_score_cdf = ts2.Cdf(critic_score)\n", + "user_score_cdf = ts2.Cdf(user_score)\n", + "norm = np.random.normal(loc = critic_score.mean(), scale = 
critic_score.std(), size = len(critic_score))\n", + "cdf_norm = ts2.Cdf(norm, label = 'Normal Distribution Model')\n", + "\n", + "fig, axs = plt.subplots(1, figsize=(17, 7), sharey=False, tight_layout=True)\n", + "tp.Plot(critic_score_cdf, color = '#9400d3', label = 'critic score')\n", + "tp.Plot(cdf_norm, label = 'normal model')\n", + "tp.Config(xlabel = 'Scores', ylabel = 'CDF')\n", + "\n", + "norm = np.random.normal(loc = user_score.mean(), scale = user_score.std(), size = len(user_score))\n", + "cdf_norm = ts2.Cdf(norm, label = 'sample')\n", + "\n", + "fig, axs = plt.subplots(1, figsize=(17, 7), sharey=False, tight_layout=True)\n", + "tp.Plot(user_score_cdf,color = 'green', label = 'user score')\n", + "tp.Plot(cdf_norm, label = 'normal model')\n", + "tp.Config(xlabel = 'Scores', ylabel = 'CDF')" + ] + }, + { + "cell_type": "markdown", + "id": "8658349c", + "metadata": {}, + "source": [ + "<div class = 'alert alert-info'>\n", + "The variables seem to be roughly normally distributed.\n", + "For better visualisation, we also take a look at their PDFs using the Kernel Density Estimation (KDE):\n", + " </div>" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "72b21f37", + "metadata": {}, + "outputs": [], + "source": [ + "mean = critic_score.mean()\n", + "std = critic_score.std()\n", + "\n", + "normal_pdf = ts2.NormalPdf(mean, std)\n", + "\n", + "estimated_pdf = ts2.EstimatedPdf(critic_score)\n", + "tp.Pdf(normal_pdf, label = 'normal model')\n", + "tp.Pdf(estimated_pdf, label = 'Critic Score', color = '#9400d3')\n", + "\n", + "tp.Config(xlabel = 'Score', ylabel = 'PDF')\n", + "\n", + "print('The Fisher-Pearson coefficient of skewness:', stats.skew(critic_score))\n", + "print('Its p-value:', stats.skewtest(critic_score)[1])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "27fbadfb", + "metadata": {}, + "outputs": [], + "source": [ + "mean = user_score.mean()\n", + "std = user_score.std()\n", + "\n", + "normal_pdf = 
ts2.NormalPdf(mean, std)\n", + "\n", + "estimated_pdf = ts2.EstimatedPdf(user_score)\n", + "tp.Pdf(normal_pdf, label = 'normal model')\n", + "tp.Pdf(estimated_pdf, label = 'User Score', color = 'green')\n", + "\n", + "tp.Config(xlabel = 'Score', ylabel = 'PDF')\n", + "\n", + "print('The Fisher-Pearson coefficient of skewness:', stats.skew(user_score))\n", + "print('Its p-value:', stats.skewtest(user_score)[1])" + ] + }, + { + "cell_type": "markdown", + "id": "505c4689", + "metadata": {}, + "source": [ + "<div class = 'alert alert-info'>\n", + "Overall, both variables are roughly normally distributed. However, both are statistically significantly skewed to the left, meaning that both users and critics rated the games better than a normal distribution would suggest. The users even more than the critics with a <i>Fisher-Pearson coefficient of skewness</i> that is around twice as large as that of the critics' score.\n", + "</div>" + ] + }, + { + "cell_type": "markdown", + "id": "60841ac4", + "metadata": {}, + "source": [ + "<h2 id = 'c4'>4. Relationships between variables</h2>" + ] + }, + { + "cell_type": "markdown", + "id": "e1261718", + "metadata": {}, + "source": [ + "<h3>4.1 Name - Sales</h3>" + ] + }, + { + "cell_type": "markdown", + "id": "1554284b", + "metadata": {}, + "source": [ + "<div class = \"alert alert-info\">\n", + "In this section, the relationship between the Name-variable and the Sales-variables in the different regions and globally, is investigated. Phrased differently, we want to know which games were sold the most in each region. 
To keep the analysis to a comprehensible size, only the top 5 games from each region are investigated.\n", + "</div>" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "44be223a", + "metadata": {}, + "outputs": [], + "source": [ + "df_name_sales = df.dropna(subset = ['Name', 'NA_Sales', 'EU_Sales', 'JP_Sales',\n", + " 'Other_Sales', 'Global_Sales'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fe55bd9e", + "metadata": {}, + "outputs": [], + "source": [ + "msg_global = df_name_sales[['Name', 'Global_Sales']].groupby(['Name'], \n", + " as_index=False).mean().sort_values(by='Global_Sales', \n", + " ascending=False).round(2).head(5)\n", + "msg_na = df_name_sales[['Name', 'NA_Sales']].groupby(['Name'], \n", + " as_index=False).mean().sort_values(by='NA_Sales', \n", + " ascending=False).round(2).head(5)\n", + "msg_eu = df_name_sales[['Name', 'EU_Sales']].groupby(['Name'], \n", + " as_index=False).mean().sort_values(by='EU_Sales', \n", + " ascending=False).round(2).head(5)\n", + "msg_jp = df_name_sales[['Name', 'JP_Sales']].groupby(['Name'], \n", + " as_index=False).mean().sort_values(by='JP_Sales', \n", + " ascending=False).round(2).head(5)\n", + "msg_other = df_name_sales[['Name', 'Other_Sales']].groupby(['Name'], \n", + " as_index=False).mean().sort_values(by='Other_Sales', \n", + " ascending=False).round(2).head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "baf96529", + "metadata": {}, + "outputs": [], + "source": [ + "plt.subplot(3, 2, 1) \n", + "plt.bar(msg_global['Name'], msg_global['Global_Sales'], color = 'black')\n", + "plt.xticks(rotation = 90)\n", + "plt.title('Top 5 global', fontsize = 18)\n", + "plt.ylabel('Sales (in million)', fontsize = 14)\n", + "for x,y in zip(msg_global['Name'],msg_global['Global_Sales']):\n", + " label = \"{:.2f}\".format(y)\n", + " plt.annotate(label,\n", + " (x,y), \n", + " textcoords=\"offset points\", \n", + " xytext=(0,10), \n", + " ha='center') \n", + 
" \n", + "\n", + "plt.subplot(3, 2, 2)\n", + "plt.bar(msg_na['Name'], msg_na['NA_Sales'], color = 'violet')\n", + "plt.xticks(rotation = 90)\n", + "plt.title('Top 5 games NA', fontsize = 18)\n", + "plt.ylabel('Sales (in million)', fontsize = 14)\n", + "for x,y in zip(msg_na['Name'],msg_na['NA_Sales']):\n", + " label = \"{:.2f}\".format(y)\n", + " plt.annotate(label,\n", + " (x,y), \n", + " textcoords=\"offset points\", \n", + " xytext=(0,10), \n", + " ha='center')\n", + "\n", + "plt.subplot(3, 2, 3)\n", + "plt.bar(msg_eu['Name'], msg_eu['EU_Sales'], color = 'blue')\n", + "plt.xticks(rotation = 90)\n", + "plt.title('Top 5 games EU', fontsize = 18)\n", + "plt.ylabel('Sales (in million)', fontsize = 14)\n", + "for x,y in zip(msg_eu['Name'],msg_eu['EU_Sales']):\n", + " label = \"{:.2f}\".format(y)\n", + " plt.annotate(label,\n", + " (x,y), \n", + " textcoords=\"offset points\", \n", + " xytext=(0,10), \n", + " ha='center')\n", + "\n", + "plt.subplot(3, 2, 4)\n", + "plt.bar(msg_jp['Name'], msg_jp['JP_Sales'], color = 'red')\n", + "plt.xticks(rotation = 90)\n", + "plt.title('Top 5 games JP', fontsize = 18)\n", + "plt.ylabel('Sales (in million)', fontsize = 14)\n", + "for x,y in zip(msg_jp['Name'],msg_jp['JP_Sales']):\n", + " label = \"{:.2f}\".format(y)\n", + " plt.annotate(label,\n", + " (x,y), \n", + " textcoords=\"offset points\", \n", + " xytext=(0,10), \n", + " ha='center')\n", + "\n", + "plt.subplot(3, 2, 5)\n", + "plt.bar(msg_other['Name'], msg_other['Other_Sales'], color = '#ffea00')\n", + "plt.xticks(rotation = 90)\n", + "plt.title('Top 5 games Other', fontsize = 18)\n", + "plt.ylabel('Sales (in million)', fontsize = 14)\n", + "for x,y in zip(msg_other['Name'],msg_other['Other_Sales']):\n", + " label = \"{:.2f}\".format(y)\n", + " plt.annotate(label,\n", + " (x,y), \n", + " textcoords=\"offset points\", \n", + " xytext=(0,10), \n", + " ha='center')\n", + "plt.subplots_adjust(left=0.1, bottom=0.1, right=0.9, top=4.0, wspace=0.4, hspace=0.8)\n", + "plt.show()" + ] 
+ }, + { + "cell_type": "markdown", + "id": "92a61f06", + "metadata": {}, + "source": [ + "<div class = 'alert alert-info'>\n", + " Apart from Japan, there is a large overlap between the most sold games in the different regions (e.g. \"Wii Resort\", \"Mario Kart Wii\"). However there is no game that is in the top 5 of most sold games in every region. Part of the reason is that the sales in Japan are heavily dominated by Pokemon-Games, which do not seem to be as popular in other regions.\n", + "</div>" + ] + }, + { + "cell_type": "markdown", + "id": "a6e54420", + "metadata": {}, + "source": [ + "<h3>4.2 Genre - Sales</h3>" + ] + }, + { + "cell_type": "markdown", + "id": "dcae0ace", + "metadata": {}, + "source": [ + "<div class = \"alert alert-info\">\n", + "In this section, the relationship between the Genre-variable and the Sales-variables is investigated. Phrased differently, we want to know, the games of which genre were sold the most in each region.\n", + "</div>" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "39ed133c", + "metadata": {}, + "outputs": [], + "source": [ + "df_genre_sales = df.dropna(subset = ['Name', 'Genre', 'NA_Sales', 'EU_Sales', 'JP_Sales',\n", + " 'Other_Sales', 'Global_Sales'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "247a8ad1", + "metadata": {}, + "outputs": [], + "source": [ + "df_genre_sales_grouped = df_genre_sales.groupby('Genre')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "46b4dc93", + "metadata": {}, + "outputs": [], + "source": [ + "spg_global = df_genre_sales_grouped['Global_Sales'].agg([np.sum]).reset_index()\n", + "spg_na = df_genre_sales_grouped['NA_Sales'].agg([np.sum]).reset_index()\n", + "spg_eu = df_genre_sales_grouped['EU_Sales'].agg([np.sum]).reset_index()\n", + "spg_jp = df_genre_sales_grouped['JP_Sales'].agg([np.sum]).reset_index()\n", + "spg_other = df_genre_sales_grouped['Other_Sales'].agg([np.sum]).reset_index()" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "id": "f42f7729", + "metadata": {}, + "outputs": [], + "source": [ + "spg_global.rename(columns = {'sum':'Global'}, inplace = True)\n", + "spg_na.rename(columns = {'sum':'NA_Sales'}, inplace = True)\n", + "spg_eu.rename(columns = {'sum':'EU_Sales'}, inplace = True)\n", + "spg_jp.rename(columns = {'sum':'JP_Sales'}, inplace = True)\n", + "spg_other.rename(columns = {'sum':'Other_Sales'}, inplace = True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b0a57009", + "metadata": {}, + "outputs": [], + "source": [ + "spg_total = pd.merge(left=spg_global,\n", + " right=spg_na,how=\"outer\", on=\"Genre\")\n", + "\n", + "spg_total = pd.merge(left=spg_total,\n", + " right=spg_eu,how=\"outer\", on=\"Genre\")\n", + "spg_total = pd.merge(left=spg_total,\n", + " right=spg_jp,how=\"outer\", on=\"Genre\")\n", + "spg_total = pd.merge(left=spg_total,\n", + " right=spg_other,how=\"outer\", on=\"Genre\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "acb62523", + "metadata": {}, + "outputs": [], + "source": [ + "spg_global.plot(x='Genre',ylabel = 'Sales (in million)', kind = 'bar', title = 'Genre - Sales global', color = 'black')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aa8dc1f6", + "metadata": {}, + "outputs": [], + "source": [ + "spg_no_global = spg_total.drop('Global', axis = 'columns')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e2219bbd", + "metadata": {}, + "outputs": [], + "source": [ + "spg_no_global.plot( x = 'Genre', kind = 'bar', ylabel = 'Sales (in million)', \n", + " color = ['violet', 'blue', 'red', '#ffea00'])" + ] + }, + { + "cell_type": "markdown", + "id": "6e2c2b1e", + "metadata": {}, + "source": [ + "<div class = 'alert alert-info'>\n", + "Apart from Japan, all regions follow a similar trend of sales across the different genres. 
In every region apart from Japan, games of the \"Action\" - Genre are the most popular. In Japan, it is \"Role-Playing\". It is the only genre in which the Sales in North America are overtaken by any other region of the world. A possible explanation would be the <i>Pokemon - Games</i>, which dominate the Japanese Sales, and are categorised as <i>Role-Playing</i> Games (see below).\n", + "</div>" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "34fb1fa9", + "metadata": {}, + "outputs": [], + "source": [ + "print(df.loc[df['Name'] == 'Pokemon Red/Pokemon Blue', 'Genre'])\n", + "print(df.loc[df['Name'] == 'Pokemon Gold/Pokemon Silver', 'Genre'])\n", + "print(df.loc[df['Name'] == 'Pokemon Diamond/Pokemon Pearl', 'Genre'])\n", + "print(df.loc[df['Name'] == 'Pokemon Black/Pokemon White', 'Genre'])" + ] + }, + { + "cell_type": "markdown", + "id": "28bbcf38", + "metadata": {}, + "source": [ + "<h3>4.3 Publisher - Sales</h3>" + ] + }, + { + "cell_type": "markdown", + "id": "c5816738", + "metadata": {}, + "source": [ + "<div class = \"alert alert-info\">\n", + "In this section, the relationship between the Publisher-variable and the Sales-variables is investigated. Phrased differently, we want to know, how many games of which publisher were sold in each region. 
As in 3.3.3, only the top 20 publishers are used.\n", + "</div>" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d5dca58c", + "metadata": {}, + "outputs": [], + "source": [ + "df_publisher_sales = df.dropna(subset = ['Name', 'Publisher', 'NA_Sales', 'EU_Sales', 'JP_Sales',\n", + " 'Other_Sales', 'Global_Sales'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7e25ac68", + "metadata": {}, + "outputs": [], + "source": [ + "df_ps_grouped = df_publisher_sales.groupby('Publisher')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "63c39343", + "metadata": {}, + "outputs": [], + "source": [ + "spp_global = df_ps_grouped['Global_Sales'].agg([np.sum]).reset_index()\n", + "spp_na = df_ps_grouped['NA_Sales'].agg([np.sum]).reset_index()\n", + "spp_eu = df_ps_grouped['EU_Sales'].agg([np.sum]).reset_index()\n", + "spp_jp = df_ps_grouped['JP_Sales'].agg([np.sum]).reset_index()\n", + "spp_other = df_ps_grouped['Other_Sales'].agg([np.sum]).reset_index()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "09d96a1b", + "metadata": {}, + "outputs": [], + "source": [ + "spp_global.rename(columns = {'sum':'Global'}, inplace = True)\n", + "spp_na.rename(columns = {'sum':'NA_Sales'}, inplace = True)\n", + "spp_eu.rename(columns = {'sum':'EU_Sales'}, inplace = True)\n", + "spp_jp.rename(columns = {'sum':'JP_Sales'}, inplace = True)\n", + "spp_other.rename(columns = {'sum':'Other_Sales'}, inplace = True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cb35e822", + "metadata": {}, + "outputs": [], + "source": [ + "spp_total = pd.merge(left=spp_global,\n", + " right=spp_na,how=\"outer\", on=\"Publisher\")\n", + "\n", + "spp_total = pd.merge(left=spp_total,\n", + " right=spp_eu,how=\"outer\", on=\"Publisher\")\n", + "spp_total = pd.merge(left=spp_total,\n", + " right=spp_jp,how=\"outer\", on=\"Publisher\")\n", + "spp_total = pd.merge(left=spp_total,\n", + " 
right=spp_other,how=\"outer\", on=\"Publisher\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3bd2cbc8", + "metadata": {}, + "outputs": [], + "source": [ + "spp_global.sort_values('Global', ascending = False).head(20).plot(x='Publisher', ylabel = 'Sales (in million)',\n", + " kind = 'bar', title = 'Genre - Sales global', \n", + " color = 'black')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bf1d0664", + "metadata": {}, + "outputs": [], + "source": [ + "top20_publisher_global = spp_global.sort_values('Global', ascending = False).head(20)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cd3379da", + "metadata": {}, + "outputs": [], + "source": [ + "top20_publisher_global_name = top20_publisher_global['Publisher']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1745b7d2", + "metadata": {}, + "outputs": [], + "source": [ + "top20_publishers = spp_total.loc[spp_total['Publisher'].isin(top20_publisher_global_name)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f912f02d", + "metadata": {}, + "outputs": [], + "source": [ + "top20_publishers_no_global = top20_publishers.drop(['Global'], axis = 1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "87977345", + "metadata": {}, + "outputs": [], + "source": [ + "top20_publishers_no_global.plot( x = 'Publisher', kind = 'bar', \n", + " color = ['violet', 'blue', 'red', '#ffea00'])" + ] + }, + { + "cell_type": "markdown", + "id": "bb7a27ea", + "metadata": {}, + "source": [ + "<div class = 'alert alert-info'>\n", + " It is interesting to see that the <i>Global / EU / Other - Sales</i> follow a similar pattern, while the <i>Japan - Sales</i> seem to overproportionally favour other publishers (like \"Namco Bandai Games\" or \"Konami Digital Entertainment\"), meaning that there is a somewhat regional preference for certain publishers. 
One extreme manifestation are the Nintendo / Electronic Arts - sales, which are quite similar in the EU, while being vastly different in Japan.</div>" + ] + }, + { + "cell_type": "markdown", + "id": "08f8fa07", + "metadata": {}, + "source": [ + "<h3>4.4 Sales - sales</h3>" + ] + }, + { + "cell_type": "markdown", + "id": "261c0d3a", + "metadata": {}, + "source": [ + "<div class = \"alert alert-info\">\n", + "In this section, the relationship of the Sales-variables with each other is investigated. Phrased differently, we want to know if, in general, the same games are sold in the same frequency in each region (forming a linear shape in the scatterplot) or if they differ.</div>" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b1d04a64", + "metadata": {}, + "outputs": [], + "source": [ + "df_sales_regions = df.dropna(subset = ['Name', 'NA_Sales', 'EU_Sales', 'JP_Sales',\n", + " 'Other_Sales'])\n", + "na_sales = df_sales_regions['NA_Sales']\n", + "eu_sales = df_sales_regions['EU_Sales']\n", + "jp_sales = df_sales_regions['JP_Sales']\n", + "other_sales = df_sales_regions['Other_Sales']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dd077c3c", + "metadata": {}, + "outputs": [], + "source": [ + "i, s = ts2.LeastSquares(na_sales, eu_sales)\n", + "i2, s2 = ts2.LeastSquares(na_sales, jp_sales)\n", + "i3, s3 = ts2.LeastSquares(na_sales, other_sales)\n", + "\n", + "fx, fy = ts2.FitLine(na_sales, i, s)\n", + "fx2, fy2 = ts2.FitLine(na_sales, i2, s2)\n", + "fx3, fy3 = ts2.FitLine(na_sales, i3, s3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "58d7bdcf", + "metadata": {}, + "outputs": [], + "source": [ + "# x = NA_Sales\n", + "\n", + "print('To avoid repetition, the x-axis of each column was only labelled at the bottom of each column')\n", + "plt.subplot(2, 3, 1)\n", + "plt.title('North America - EU')\n", + "tp.scatter(na_sales, eu_sales, alpha = 0.1, color = '#9400d3')\n", + "tp.Plot(fx, fy, color = 
'yellow', linewidth = 2)\n", + "tp.Config(ylabel = 'EU_Sales')\n", + "\n", + "plt.subplot(2, 3, 2)\n", + "plt.title('North America - Japan')\n", + "tp.Scatter(na_sales, jp_sales, alpha = 0.1, color = '#9400d3')\n", + "tp.Plot(fx2, fy2, color = 'yellow', linewidth = 2)\n", + "tp.Config(ylabel = 'JP_Sales')\n", + "\n", + "plt.subplot(2, 3, 3)\n", + "plt.title('North America - Other')\n", + "tp.Scatter(na_sales, other_sales, alpha = 0.1, color = '#9400d3')\n", + "tp.Plot(fx3, fy3, color = 'yellow', linewidth = 2)\n", + "tp.Config(ylabel = 'Other_Sales')\n", + "\n", + "\n", + "plt.subplot(2, 3, 4)\n", + "tp.HexBin(na_sales, eu_sales, alpha = 1)\n", + "tp.Config(xlabel = 'NA_Sales', ylabel = 'EU_Sales')\n", + "\n", + "plt.subplot(2, 3, 5)\n", + "tp.HexBin(na_sales, jp_sales, alpha = 1)\n", + "tp.Config(xlabel = 'NA_Sales', ylabel = 'JP_Sales')\n", + "\n", + "plt.subplot(2, 3, 6)\n", + "tp.HexBin(na_sales, other_sales, alpha = 1)\n", + "tp.Config(xlabel = 'NA_Sales', ylabel = 'Other_Sales')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4f87d43a", + "metadata": {}, + "outputs": [], + "source": [ + "# x = EU-Sales\n", + "\n", + "i4, s4 = ts2.LeastSquares(eu_sales, jp_sales)\n", + "i5, s5 = ts2.LeastSquares(eu_sales, other_sales)\n", + "\n", + "fx4, fy4 = ts2.FitLine(eu_sales, i4, s4)\n", + "fx5, fy5 = ts2.FitLine(eu_sales, i5, s5)\n", + "\n", + "plt.subplot(2, 2, 1)\n", + "plt.title('EU - Japan')\n", + "tp.Scatter(eu_sales, jp_sales, alpha = 0.1, color = '#9400d3')\n", + "tp.Plot(fx4, fy4, color = 'yellow', linewidth = 2)\n", + "tp.Config(ylabel = 'JP_Sales')\n", + "\n", + "plt.subplot(2, 2, 2)\n", + "plt.title('EU - Other')\n", + "tp.Scatter(eu_sales, other_sales, alpha = 0.1, color = '#9400d3')\n", + "tp.Plot(fx5, fy5, color = 'yellow', linewidth = 2)\n", + "tp.Config(ylabel = 'Other_Sales')\n", + "\n", + "plt.subplot(2, 2, 3)\n", + "tp.HexBin(eu_sales, jp_sales, alpha = 1)\n", + "tp.Config(xlabel ='EU_Sales', ylabel = 'JP_Sales')\n", + "\n", + 
"plt.subplot(2, 2, 4)\n", + "tp.HexBin(eu_sales, other_sales, alpha = 1)\n", + "tp.Config(xlabel = 'EU_Sales', ylabel = 'Other_Sales')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "95683e25", + "metadata": {}, + "outputs": [], + "source": [ + "# x = JP_Sales\n", + "\n", + "i6, s6 = ts2.LeastSquares(jp_sales, other_sales)\n", + "fx6, fy6 = ts2.FitLine(jp_sales, i6, s6)\n", + "\n", + "plt.subplot(2, 1, 1)\n", + "plt.title('JP - Other')\n", + "tp.Scatter(jp_sales, other_sales, alpha = 0.1, color = '#9400d3')\n", + "tp.Plot(fx6, fy6, color = 'yellow', linewidth = 2)\n", + "tp.Config(ylabel = 'Other_Sales')\n", + "\n", + "plt.subplot(2, 1, 2)\n", + "tp.HexBin(jp_sales, other_sales, alpha = 1)\n", + "tp.Config(xlabel ='JP_Sales', ylabel = 'Other_Sales')" + ] + }, + { + "cell_type": "markdown", + "id": "bb61c62b", + "metadata": {}, + "source": [ + "<div class = 'alert alert-info'>\n", + " Overall, it can be said that most video games are sold in small amounts in every region, which is exemplified by the HexBin-plots showing only a small blue dot in the bottom left corner indicating a high density of data. 
All other datapoints are so scarce that they are not visible in the HexBin-plot.\n", + "At first sight, the <i>EU</i>, <i>NA</i> and <i>Other</i> - variables seem to be more correlated with each other (forming a straight-line like shape), while when set in relation to the <i>JP</i> - variable, the values are more strongly accumulated along the edges of the plot, meaning that single games are sold much more in one region than the other.<br>\n", + "A more formal investigation into their correlation will be undertaken in the next chapter.\n", + "</div>" + ] + }, + { + "cell_type": "markdown", + "id": "013f6b57", + "metadata": {}, + "source": [ + "<h3>4.5 User / Critic Score - Sales</h3>" + ] + }, + { + "cell_type": "markdown", + "id": "8106056f", + "metadata": {}, + "source": [ + "<div class = 'alert alert-info'>\n", + " To keep the analysis in the scope of this term paper, it is assumed that the User / Critic - scores do not affect the Sales differently in each region and therefore only the Global Sales variable is used for analysis." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "32f1305e", + "metadata": {}, + "outputs": [], + "source": [ + "df_ucs = df\n", + "df_ucs.replace('tbd', np.nan, inplace = True)\n", + "df_ucs = df.dropna(subset = ['User_Score', 'Critic_Score', 'Global_Sales'])\n", + "\n", + "ucs_gs = df_ucs['Global_Sales']\n", + "ucs_gs_log = np.log(df_ucs['Global_Sales'])\n", + "ucs_user_score = pd.to_numeric(df_ucs['User_Score'])\n", + "ucs_critic_score = df_ucs['Critic_Score'] / 10" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e2d979bd", + "metadata": {}, + "outputs": [], + "source": [ + "i7, s7 = ts2.LeastSquares(ucs_user_score, ucs_gs_log)\n", + "i8, s8 = ts2.LeastSquares(ucs_critic_score, ucs_gs_log,)\n", + "fx7, fy7 = ts2.FitLine(jp_sales, i7, s7)\n", + "fx8, fy8 = ts2.FitLine(ucs_critic_score, i8, s8)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "506c0e05", + "metadata": {}, + "outputs": [], + "source": [ + "# User_Score\n", + "plt.subplot(2, 1, 1)\n", + "plt.title('User Score - Global Sales')\n", + "tp.Scatter(ucs_user_score, ucs_gs, color = 'black')\n", + "tp.Plot(fx7, fy7, color = 'yellow', linewidth = 2)\n", + "tp.Config(xlabel = 'User Score', ylabel = 'Global Sales')\n", + "\n", + "# Critic_Score\n", + "plt.subplot(2, 1, 2)\n", + "plt.title('Critic Score - Global Sales')\n", + "tp.Scatter(ucs_critic_score, ucs_gs, color = 'black')\n", + "tp.Plot(fx8, fy8, color = 'yellow', linewidth = 2)\n", + "tp.Config(xlabel = 'Critic Score', ylabel = 'Global Sales')\n", + "\n", + "plt.subplots_adjust(left=0.1, bottom=0.1, right=0.9, top=1.0, wspace=0.4, hspace=0.8)" + ] + }, + { + "cell_type": "markdown", + "id": "e1f45820", + "metadata": {}, + "source": [ + "<div class = 'alert alert-info'>\n", + "Both variables look rather similar, meaning that games with a higher score are, in tendency, sold more. 
The actual correlation and further analysis will be done in the next chapter.\n", + "</div>" + ] + }, + { + "cell_type": "markdown", + "id": "c9d1908c", + "metadata": {}, + "source": [ + "________________________________________________________________________________________________________________\n", + "\n", + "<div class = 'alert alert-info'>\n", + "<br>\n", + " \n", + "Considering the results of the descriptive analysis, the following Research Questions form the basis of the following inductive analysis:\n", + "\n", + "<b>RQ1: Are there differences in sales between the regions?</b>\n", + " <br>\n", + " <ul>\n", + " <li> H<sub>1.0</sub>: There are no correlations between sales in the different regions.</li>\n", + " <li> H<sub>1.1</sub>: All regional sales variables correlate strongly and statistically significantly with one another, meaning the same games have the same popularity in each of the regions.</li>\n", + " <li>H<sub>1.2</sub>: There are differences in the correlations, meaning that some games are sold more in one region than the other.</li>\n", + " </ul>\n", + " \n", + "<b>RQ2: Do User & Critic score correlate with Global Sales and which Score - variable explains Global Sales better?</b>\n", + " <br>\n", + " <ul>\n", + " <li>H<sub>2.0</sub>: None of them correlate statistically significantly with the <i>Global Sales</i> variables.</li>\n", + " <li>H<sub>2.1</sub>: The User Score variable has a stronger statistically significant correlation and explains more of the variation of Global Sales than Critic Score.</li>\n", + " <li>H<sub>2.2</sub>: The Critic Score variable has a stronger statistically significant correlation and explains more of the variation of Global Sales than User Score.</li>\n", + " </ul>\n", + "\n", + "<b>RQ3: Based on the results of RQ2 which of the following variables explains the sales globally and in Japan better?</b> <br>(Because of the different behaviour of the <i>JP_Sales</i> - variable in the descriptive analysis, it is 
investigated individually, the <i>Global Sales</i> - variable can be seen as a \"Proxy-Variable\" for the other sales - variables. Investigating each individually would exceed the scope of this analysis)\n", + " <ul>\n", + " <li>Platform</li>\n", + " <li>Genre</li>\n", + " </ul>\n", + "</div>" + ] + }, + { + "cell_type": "markdown", + "id": "d4acf8af", + "metadata": {}, + "source": [ + "<h2 id = 'c5'>5. Answering the Research Questions / Inductive Analysis</h2>" + ] + }, + { + "cell_type": "markdown", + "id": "918a2343", + "metadata": {}, + "source": [ + "<div class = 'alert alert-info'>\n", + "Before starting the inductive analysis, it has to be pointed out that when doing a \"linear regression\" with logged data, we are not actually fitting a straight line to the data, but an exponential function.\n", + " <br>\n", + " <br>\n", + " Another fact to point out, is that regarding the regression analysis, only for <i>User Score</i> and <i>Critic Score</i>, the slopes will be examined. Due to the myriad of different manifestation, including it in the other categorical variables' analyses is not possible. The intercept is not reported, because it was judged to not contain any meaningful information, especially with the data being logged.\n", + "<br>\n", + "<br>\n", + "Also, the level of statistical significance assumed is: $p < 0.01$.\n", + "</div>" + ] + }, + { + "cell_type": "markdown", + "id": "104e788d", + "metadata": {}, + "source": [ + "<h3>5.1 RQ1: Differences in sales between regions</h3>" + ] + }, + { + "cell_type": "markdown", + "id": "3f1e52d8", + "metadata": {}, + "source": [ + "<div class = 'class alert alert-info'>\n", + "Using the lognormal-version of the data, which is roughly normally distributed, the pearson correlation coefficient should yield interpretable results. 
However, to check, <i>Spearman's r</i> is also calculated for comparison.\n", + "</div>" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "405aa2e4", + "metadata": {}, + "outputs": [], + "source": [ + "df_sales_or_nn = df_sales_regions[['NA_Sales', 'EU_Sales', 'JP_Sales', \n", + " 'Other_Sales']]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "51468990", + "metadata": {}, + "outputs": [], + "source": [ + "# To prevent getting \"-inf\", only rows containing no 0 values are selected\n", + "df_sales_or_nn = df_sales_or_nn[(df_sales_or_nn['NA_Sales'] > 0) \n", + " & (df_sales_or_nn['EU_Sales'] > 0) \n", + " & (df_sales_or_nn['JP_Sales'] > 0)\n", + " & (df_sales_or_nn['Other_Sales'] > 0)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e458f31e", + "metadata": {}, + "outputs": [], + "source": [ + "df_sales_or_nn_log = np.log(df_sales_or_nn)\n", + "na_sales_log = df_sales_or_nn_log['NA_Sales']\n", + "eu_sales_log = df_sales_or_nn_log['EU_Sales']\n", + "jp_sales_log = df_sales_or_nn_log['JP_Sales']\n", + "other_sales_log = df_sales_or_nn_log['Other_Sales']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3cf89b77", + "metadata": {}, + "outputs": [], + "source": [ + "sales_corr_matrix_pearson_log = df_sales_or_nn_log.corr(method = 'pearson')\n", + "sns.set(font_scale=1.5)\n", + "sns.heatmap(sales_corr_matrix_pearson_log, annot = True).set(title = 'Correlation matrix (Pearson)') " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0738c803", + "metadata": {}, + "outputs": [], + "source": [ + "sales_corr_matrix_spearman_log = df_sales_or_nn_log.corr(method = 'spearman')\n", + "sns.heatmap(sales_corr_matrix_spearman_log, annot = True).set(title = 'Correlation matrix (Spearman)') " + ] + }, + { + "cell_type": "markdown", + "id": "306020dc", + "metadata": {}, + "source": [ + "<div class = 'alert alert-info'>\n", + "The observable differences between the pearson 
and spearman correlation coefficient show that even though the data is roughly lognormally distributed, the results still differ quite a bit. Therefore, for further analysis, the spearman correlation coefficient as a <a href = \"https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.spearmanr.html?highlight=spearman\">non-parametric measure that does not assume that both datasets are normally distributed</a>, is used.\n", + "</div>" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dedd80c8", + "metadata": {}, + "outputs": [], + "source": [ + "# 95 % Confidence Intervals of correlation\n", + "print('95% CI NA - EU:', ci(na_sales_log, eu_sales_log))\n", + "print('95% CI NA - JP:', ci(na_sales_log, jp_sales_log))\n", + "print('95% CI NA - Other:', ci(na_sales_log, other_sales_log))\n", + "print('95% CI EU - JP:', ci(eu_sales_log, jp_sales_log))\n", + "print('95% CI EU - Other:', ci(eu_sales_log, other_sales_log))\n", + "print('95% CI JP - Other:', ci(jp_sales_log, other_sales_log))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c07fe252", + "metadata": {}, + "outputs": [], + "source": [ + "# p-values\n", + "print('p-value (NA - EU):', stats.spearmanr(na_sales_log, eu_sales_log)[1])\n", + "print('p-value (NA - JP):', stats.spearmanr(na_sales_log, jp_sales_log)[1])\n", + "print('p-value (NA - Other):', stats.spearmanr(na_sales_log, other_sales_log)[1])\n", + "print('p-value (EU - JP):', stats.spearmanr(eu_sales_log, jp_sales_log)[1])\n", + "print('p-value (EU - Other):', stats.spearmanr(eu_sales_log, other_sales_log)[1])\n", + "print('p-value (JP - Other):', stats.spearmanr(jp_sales_log, other_sales_log)[1])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "74da38b4", + "metadata": {}, + "outputs": [], + "source": [ + "# R-squared of linear regression\n", + "res_naeu = stats.linregress(na_sales_log, eu_sales_log)\n", + "res_naot = stats.linregress(na_sales_log, other_sales_log)\n", + 
"res_euot = stats.linregress(eu_sales_log, other_sales_log)\n",
+ "\n",
+ "print('R-squared of NA_Sales and EU_Sales:', res_naeu.rvalue ** 2)\n",
+ "print('R-squared of NA_Sales and Other_Sales:', res_naot.rvalue ** 2)\n",
+ "print('R-squared of EU_Sales and Other_Sales:', res_euot.rvalue ** 2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3bc78af3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "for x, y in df_sales_or_nn_log.iteritems():\n",
+ " if x == 'JP_Sales':\n",
+ " continue\n",
+ " else:\n",
+ " res = stats.linregress(df_sales_or_nn_log[x], jp_sales_log)\n",
+ " print('R-squared of', x, 'and JP_Sales:', res.rvalue ** 2)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "08d048a5",
+ "metadata": {},
+ "source": [
+ "<div class = 'alert alert-info'>\n",
+ "It can be said that all regional sales are statistically significantly correlated, except for sales in Japan and Other, whose p-value is above the threshold of 0.01, but is still quite low ($p < 0.05$).<br>\n",
+ " We can therefore reject H<sub>1.0</sub>.\n",
+ "</div> "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ecf97f03",
+ "metadata": {},
+ "source": [
+ "<div class = 'alert alert-info'>\n",
+ "However, the much lower correlation coefficients of Japanese sales and sales in all other regions may be an indicator that the Japanese market behaves differently to the others.<br>\n",
+ "This finding is supported by the R-squared values of the non-Japanese regions being very high, meaning that sales in one region explain the sales in another region quite well. 
For Japanese sales, however, this does not hold true, with the highest R-squared value being only as high as ~ 5.8 %.\n",
+ " \n",
+ "<br> \n",
+ "<br>\n",
+ "We can therefore reject H<sub>1.1</sub> and accept H<sub>1.2</sub>.\n",
+ "<br>\n",
+ "</div>"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a2c87b77",
+ "metadata": {},
+ "source": [
+ "<h3>5.2 RQ2: User/Critic score - Global sales correlation & regression</h3>"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3225189d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_ucsa = df\n",
+ "df_ucsa.replace('tbd', np.nan, inplace = True)\n",
+ "df_ucsa = df_ucsa.dropna(subset = ['User_Score', 'Critic_Score', 'Global_Sales'])\n",
+ "\n",
+ "ucs_gs = df_ucsa['Global_Sales']\n",
+ "ucs_gs_log = np.log(df_ucsa['Global_Sales'])\n",
+ "ucs_us = pd.to_numeric(df_ucsa['User_Score'])\n",
+ "ucs_cs= df_ucsa['Critic_Score'] / 10 # to turn into \"1 out of 10\" - units"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "42bd1c63",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# User Score\n",
+ "print(\"Spearman's r:\", stats.spearmanr(ucs_us, ucs_gs_log)[0])\n",
+ "print(\"p-value:\", stats.spearmanr(ucs_us, ucs_gs_log)[1])\n",
+ "res = stats.linregress(ucs_us, ucs_gs_log)\n",
+ "print(\"R-squared:\", res.rvalue **2)\n",
+ "print(\"Slope:\", res.slope)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c44b2655",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Critic Score\n",
+ "print(\"Spearman's r:\", stats.spearmanr(ucs_cs, ucs_gs_log)[0])\n",
+ "print(\"p-value:\", stats.spearmanr(ucs_cs, ucs_gs_log)[1])\n",
+ "res = stats.linregress(ucs_cs, ucs_gs_log)\n",
+ "print(\"R-squared:\", res.rvalue **2)\n",
+ "print(\"Slope:\", res.slope)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d406ea76",
+ "metadata": {},
+ "source": [
+ "<div class = 'alert alert-info'>\n",
+ " Both Score variables are statistically significantly correlated to 
<i>Global_Sales</i> ($p < 0.01$).\n",
+ " We can therefore reject H<sub>2.0</sub>.\n",
+ "However, the correlation between <i>Critic_Score</i> and <i>Global_Sales</i> is more than twice as strong as between <i>User_Score</i> and <i>Global_Sales</i>.\n",
+ "The slope of the linear regression is a bit harder to interpret, because the sales variable is logged. However, by using the following formula: <i>(exp(slope) – 1) * 100</i>, we can approximate that with a one unit increase of User Score, Global Sales increases by ~ 39 %. With a one unit increase of Critic Score, Global Sales increases by ~ 43 %.<br>\n",
+ " Subsequently, the R<sup>2</sup>-value of <i>Critic Score</i> is more than four times as large as that of User_Score, meaning that the <i>Critic Score</i> - variable holds more explanatory power over the <i>Global_Sales</i> - values than the <i>User Score</i> - variable.<br>\n",
+ " \n",
+ "\n",
+ "We can therefore accept H<sub>2.2</sub> and reject H<sub>2.1</sub>.\n",
+ "</div>"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a38bbf3c",
+ "metadata": {},
+ "source": [
+ "<h3>5.3 RQ3: Explanatory power of Genre and Publisher</h3>"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d2872617",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_reg = df\n",
+ "# To prevent getting \"-inf\", only rows containing no 0 values are selected\n",
+ "df_reg = df_reg.loc[df_reg['JP_Sales'] > 0]\n",
+ "df_reg['Global_Sales_log'] = np.log(df_reg['Global_Sales'])\n",
+ "df_reg['JP_Sales_log'] = np.log(df_reg['JP_Sales'])\n",
+ "df_reg.dropna(inplace=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "227558cd",
+ "metadata": {},
+ "source": [
+ "<h4>5.3.1 Explanatory power of <i>Genre</i></h4>"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "322e08bc",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Global\n",
+ "formula_global_genre = ('Global_Sales_log ~ Genre')\n",
+ "results_global_genre = 
smf.ols(formula_global_genre, data=df_reg).fit()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f1308258",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "results_global_genre.rsquared"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d1fc6fa1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Japan\n",
+ "formula_jp_genre = ('JP_Sales_log ~ Genre')\n",
+ "results_jp_genre = smf.ols(formula_jp_genre, data=df_reg).fit()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4ce3e965",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "results_jp_genre.rsquared"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b1273bf4",
+ "metadata": {},
+ "source": [
+ "<h4>5.3.2 Explanatory power of <i>Publisher</i></h4>"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d30dfe60",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Global\n",
+ "formula_global_publisher = ('Global_Sales_log ~ Publisher')\n",
+ "results_global_publisher = smf.ols(formula_global_publisher, data=df_reg).fit()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "63f010be",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "results_global_publisher.rsquared"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9b9ad4b4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Japan\n",
+ "formula_jp_publisher = ('JP_Sales_log ~ Publisher')\n",
+ "results_jp_publisher = smf.ols(formula_jp_publisher, data=df_reg).fit()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "538b95a9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "results_jp_publisher.rsquared"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "acabfe47",
+ "metadata": {},
+ "source": [
+ "<div class = 'alert alert-info'>\n",
+ " Overall, the most explanatory power is held by the <i>Publisher</i> variable, followed by <i>Genre</i>. 
While R-squared is quite similar regarding <i>Genre</i> between the regions, it differs more greatly regarding <i>Publisher</i>, suggesting that there is a bigger \"regional effect\" of <i>Publisher</i> than of <i>Genre</i>. \n",
+ " </div>"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3c877d86",
+ "metadata": {},
+ "source": [
+ "<h2>6. Discussion</h2>"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e25cb9a3",
+ "metadata": {},
+ "source": [
+ "<div class = 'alert alert-info'>\n",
+ "In this short analysis of video games sales data, first, it was stated why it is of interest to examine video games sales data. Second, each variable was examined and described. Third, a descriptive analysis of each variable was carried out. Fourth, relationships between variables were explored and based on the results, three research questions were formulated. In the fifth and final part, these research questions were investigated.\n",
+ "A short summary is given here:\n",
+ " <br>\n",
+ " \n",
+ "<ul>\n",
+ " <li>For <b>RQ1</b>, <b><i>H<sub>1.2</sub></i></b> was accepted, that there are differences in the correlations and therefore the markets of the different regions do not behave the same.</li>\n",
+ " <li>For <b>RQ2</b>, <b><i>H<sub>2.2</sub></i></b> was accepted, that <i>Critic Score</i> has a stronger statistically significant correlation and explains more of Global Sales.</li>\n",
+ " <li>For <b>RQ3</b>, it can be said that the Publisher variable explains the most of global and Japanese sales. It also having a bigger difference in R<sup>2</sup> in between the regions than Genre, is in line with the finding of the descriptive analysis that the two sales variables follow different patterns regarding the publisher.</li> \n",
+ " </ul>\n",
+ " \n",
+ "<br>\n",
+ "<br>\n",
+ "Therefore, the key findings of this analysis are, first, that the Japanese market behaves differently than the markets of the other regions. 
This is shown in sales by genre, publisher and the correlation of sales. Second, that the <i>Publisher</i> explains most of global sales, followed by <i>Critic Score</i>, <i>Genre</i> and lastly <i>User Score</i>. This might suggest that user scores are either not a resource potential customers consult when deciding on buying a game or that users score games so wildly different that they are not a good explanatory factor of sales. \n", + " <br>\n", + "Some serious limitations of this analysis are that, as described in section 2, some numbers might not be \"real sale numbers\", but mere estimates and thus the results should only cautiously be applied as interpretations to the real world. Another problem is that around half of all the games have no score ratings (NaN) and 14.5% of User Scores being \"tbd\", making these results less meaningful.\n", + " <br>\n", + "However, the results give an interesting first look at video game sales and could be fleshed out in further investigations.\n", + " </div>" + ] + }, + { + "cell_type": "markdown", + "id": "e8dea374", + "metadata": {}, + "source": [ + "<h2 id=\"Ref\">References</h2>" + ] + }, + { + "cell_type": "markdown", + "id": "4a8c73ab", + "metadata": {}, + "source": [ + "Carless, S. (2008): <a class=\"anchor\" id=\"Carless2008\"></a><i>Analysis: What VGChartz Does (And Doesn't) Do For The Game Biz</i>. https://www.gamedeveloper.com/pc/analysis-what-vgchartz-does-and-doesn-t-do-for-the-game-biz. Last accessed: 04.08.22\n", + "\n", + "Dunford, D., Dale, B., Stylianou, N., Lowther, E., Ahmed, M., & de la Torre Arenas, I. (2020):<a class=\"anchor\" id=\"Dunford2020\"></a><i>Coronavirus: The world in lockdown in maps and charts</i> https://www.bbc.com/news/world-52103747. Last accessed: 02.08.22\n", + "\n", + "Globalwebindex (GWI) (2020):<a class=\"anchor\" id=\"GWI2020\"></a> <i>Coronavirus Research | March 2020 Release 3: Multi-market research</i>. 
https://www.gwi.com/hubfs/1.%20Coronavirus%20Research%20PDFs/GWI%20coronavirus%20findings%20March%202020%20-%20Multi-Market%20data%20(Release%203).pdf. Last accessed: 02.08.22\n", + "\n", + "GamesIndustry.biz (2020):<a class=\"anchor\" id=\"GamesIndustry2020\"></a> \n", + "<i>What is happening with video game sales during coronavirus\n", + "GamesIndustry.biz analyses the latest figures from GSD.</i> <a href=\"https://www.gamesindustry.biz/what-is-happening-with-video-game-sales-during-coronavirus\">https://www.gamesindustry.biz/what-is-happening-with-video-game-sales-during-coronavirus</a>. Last accessed: 02.08.2022\n", + "\n", + "Kohler, C. (2008):<a class=\"anchor\" id=\"Kohler2008\"></a><i>Why We Don't Reference VGChartz</i> https://www.wired.com/2008/06/why-we-dont-ref/. Last accessed: 04.08.22\n", + "\n", + "Metacritic (2022):<a class=\"anchor\" id=\"Metacritic2022a\"></a><i>About Us</i> https://www.metacritic.com/about-metacritic. Last accessed: 04.08.22\n", + "\n", + "Mirillis (2017):<a class=\"anchor\" id=\"mirillis2017\"></a><i>Complete List of Game Genres</i>. https://mirillis.com/blog/en/complete-list-of-game-genres/. Last accessed: 04.08.22\n", + "\n", + "Oxford Learner's Dictionaries (2022):<a class=\"anchor\" id=\"Oxford2022\"></a><i>Definition of genre noun from the Oxford Advanced Learner's Dictionary</i> https://www.oxfordlearnersdictionaries.com/definition/english/genre. Last accessed: 04.08.22\n", + "\n", + "pwc (2021):<a class=\"anchor\" id=\"pwc2021\"></a><i>German Entertainment and Media Outlook 2021–2025. Fakten, Prognosen und Trends für 13 Segmente der Entertainment- und Medienbranche in Deutschland.</i> https://www.pwc.de/de/technologie-medien-und-telekommunikation/gemo/2021/german-entertainment-media-outlook-2021-2025.pdf. 
Last accessed: 03.08.22\n", + "\n", + "Simon-Kucher & Partners (2020):<a class=\"anchor\" id=\"SimonKucher2020\"></a><i>Studie zeigt: Gamer weltweit investieren seit der Corona-Krise mehr Zeit und Geld in Videospiele – ein Trend, der bleiben wird</i>. https://www.simon-kucher.com/de/about/media-center/studie-zeigt-gamer-weltweit-investieren-seit-der-corona-krise-mehr-zeit-und-geld-videospiele-ein-trend-der-bleiben-wird. Last accessed: 02.08.22\n", + "\n", + "Statista & Juniper Research (2021):<a class=\"anchor\" id=\"globalmarketvalue20202025\"></a><i>Global video game market value from 2020 to 2025</i>. https://www.statista.com/statistics/292056/video-game-market-value-worldwide/. Last accessed: 03.08.22\n", + "\n", + "VGChartz (2022):<a class=\"anchor\" id=\"vgchartz2022\"></a> <i>About VGChartz</i>. https://www.vgchartz.com/about.php. Last accessed: 04.08.22\n", + "\n", + "Wijman, T. (2020): <a class=\"anchor\" id=\"Wijman2020\"></a><i>The World's 2.7 Billion Gamers Will Spend \\$159.3 Billion on Games in 2020. The Market Will Surpass 200 Billion by 2023</i>. https://newzoo.com/insights/articles/newzoo-games-market-numbers-revenues-and-audience-2020-2023. Last accessed: 03.08.22\n", + "\n", + "Wardrip-Fruin, N. (2021):<a class=\"anchor\" id=\"Wardrip-Fruin2021\"></a><i> Before Pong, There Was Computer Space</i>. https://thereader.mitpress.mit.edu/before-pong-there-was-computer-space/. Last accessed: 03.08.22.\n", + "\n", + "Zegarra, T. (2020): <a class=\"anchor\" id=\"Zegarra2020\"></a>Game Developers vs Game Publishers: What’s the difference?. In: <i>HP Tech Takes/... Exploring today's technology for tomorrow's possibilities</i>. https://www.hp.com/us-en/shop/tech-takes/game-developers-vs-game-publishers. 
Last accessed 04.08.22\n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d59a6a2e",
+ "metadata": {},
+ "source": [
+ "<h2 id=\"appendixa\">Appendix: Platform Acronyms</h2>"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1adaf850",
+ "metadata": {},
+ "source": [
+ "Video game platform acronyms in order of frequency in the dataset. All years are the year of the release in the first country the platform was released in:"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3f4b3fe1",
+ "metadata": {},
+ "source": [
+ "* PS2 = PlayStation 2 (2000, Sony)\n",
+ "* DS = Nintendo DS (2004, Nintendo)\n",
+ "* PS3 = PlayStation 3 (2006, Sony)\n",
+ "* Wii = Wii (2006, Nintendo)\n",
+ "* X360 = Xbox 360 (2005, Microsoft)\n",
+ "* PSP = Playstation Portable (2004, Sony)\n",
+ "* PS = PlayStation (1994, Sony)\n",
+ "* PC = Personal Computer (Too many different manifestations to be determined)\n",
+ "* XB = Xbox (2001, Microsoft)\n",
+ "* GBA = Game Boy Advance (2001, Nintendo)\n",
+ "* GC = GameCube (2001, Nintendo)\n",
+ "* 3DS = Nintendo 3DS (2011, Nintendo)\n",
+ "* PSV = Playstation Vita (2011, Sony)\n",
+ "* PS4 = Playstation 4 (2013, Sony)\n",
+ "* N64 = Nintendo 64 (1996, Nintendo)\n",
+ "* XOne = Xbox One (2013, Microsoft)\n",
+ "* SNES = Super Nintendo Entertainment System (1990, Nintendo)\n",
+ "* SAT = Sega Saturn (1994, Sega)\n",
+ "* WiiU = Wii U (2012, Nintendo)\n",
+ "* 2600 = Atari 2600 (1977, Atari Inc.)\n",
+ "* NES = Nintendo Entertainment System (1983, Nintendo)\n",
+ "* GB = Game Boy (1989, Nintendo)\n",
+ "* DC = Dreamcast (1998, Sega)\n",
+ "* GEN = Sega Genesis (1988, Sega)\n",
+ "* NG = NeoGeo (1990, SNK)\n",
+ "* SCD = Sega CD (1991, Sega)\n",
+ "* WS = WonderSwan (1999, Bandai)\n",
+ "* 3DO = 3DO Interactive Multiplayer (1993, 3DO Company)\n",
+ "* TG16 = TurboGrafx-16 (1987, NEC Home Electronics)\n",
+ "* GG = GameGear (1990, Sega)\n",
+ "* PCFX = PC-FX (1994, NEC-Hudson Soft)\n",
+ "\n",
+ "Source: [VGChartz 2022b](#vgchartz2022b)\n"
+ ]
+ 
} + ], + "metadata": { + "language_info": { + "name": "python", + "pygments_lexer": "ipython3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}