Skip to content
Snippets Groups Projects
wikipedia_language_editions.ipynb 380 KiB
Newer Older
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "specific-projection",
   "metadata": {},
   "source": [
    "# Comparing Wikipedia Language Editions\n",
    "\n",
    "Wikipedia (as of January 2022) has [more than 300 active language editions](https://en.wikipedia.org/wiki/List_of_Wikipedias). We can compare (some of) these editions quantitatively and qualitatively using the table from https://meta.wikimedia.org/wiki/Wikipedia_article_depth:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "employed-complexity",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Language</th>\n",
       "      <th>Language (local)</th>\n",
       "      <th>Wiki</th>\n",
       "      <th>Depth</th>\n",
       "      <th>Articles</th>\n",
       "      <th>Edits</th>\n",
       "      <th>Total Pages</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Acehnese</td>\n",
       "      <td>Acehnese</td>\n",
       "      <td>ace</td>\n",
       "      <td>6.33</td>\n",
       "      <td>12529</td>\n",
       "      <td>136374</td>\n",
       "      <td>26393</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Afrikaans</td>\n",
       "      <td>Afrikaans</td>\n",
       "      <td>af</td>\n",
       "      <td>46.23</td>\n",
       "      <td>101532</td>\n",
       "      <td>2470797</td>\n",
       "      <td>367945</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Alemannic</td>\n",
       "      <td>Alemannisch</td>\n",
       "      <td>als</td>\n",
       "      <td>29.55</td>\n",
       "      <td>28246</td>\n",
       "      <td>1000517</td>\n",
       "      <td>68387</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Amharic</td>\n",
       "      <td>አማርኛ</td>\n",
       "      <td>am</td>\n",
       "      <td>34.08</td>\n",
       "      <td>15005</td>\n",
       "      <td>367942</td>\n",
       "      <td>45968</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Aragonese</td>\n",
       "      <td>Aragonés</td>\n",
       "      <td>an</td>\n",
       "      <td>59.6</td>\n",
       "      <td>40848</td>\n",
       "      <td>1799537</td>\n",
       "      <td>123442</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    Language Language (local) Wiki  Depth Articles    Edits Total Pages\n",
       "0   Acehnese         Acehnese  ace   6.33    12529   136374       26393\n",
       "1  Afrikaans        Afrikaans   af  46.23   101532  2470797      367945\n",
       "2  Alemannic      Alemannisch  als  29.55    28246  1000517       68387\n",
       "3    Amharic             አማርኛ   am  34.08    15005   367942       45968\n",
       "4  Aragonese         Aragonés   an   59.6    40848  1799537      123442"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# read all (!) HTML tables from the Wikipedia page on article depth\n",
    "tables = pd.read_html(\"https://meta.wikimedia.org/wiki/Wikipedia_article_depth\")\n",
    "\n",
    "# check which table is the one we are looking for\n",
    "tables[3].head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "attempted-sponsorship",
   "metadata": {},
   "source": [
    "Apparently, the fourth table (with index 3) is the table we are looking for. Let's extract it into a dataframe:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "romance-removal",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Language</th>\n",
       "      <th>Language (local)</th>\n",
       "      <th>Wiki</th>\n",
       "      <th>Depth</th>\n",
       "      <th>Articles</th>\n",
       "      <th>Edits</th>\n",
       "      <th>Total Pages</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Acehnese</td>\n",
       "      <td>Acehnese</td>\n",
       "      <td>ace</td>\n",
       "      <td>6.33</td>\n",
       "      <td>12529</td>\n",
       "      <td>136374</td>\n",
       "      <td>26393</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Afrikaans</td>\n",
       "      <td>Afrikaans</td>\n",
       "      <td>af</td>\n",
       "      <td>46.23</td>\n",
       "      <td>101532</td>\n",
       "      <td>2470797</td>\n",
       "      <td>367945</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Alemannic</td>\n",
       "      <td>Alemannisch</td>\n",
       "      <td>als</td>\n",
       "      <td>29.55</td>\n",
       "      <td>28246</td>\n",
       "      <td>1000517</td>\n",
Loading
Loading full blame...