From bef8275c9e6dbca4240bcbe92ead5b1cfb3655f3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Robert=20J=C3=A4schke?= <jaeschke@l3s.de>
Date: Wed, 25 Jan 2023 17:00:39 +0100
Subject: [PATCH] added regex example

---
 README.org                      |   2 +
 notebooks/wikipedia_regex.ipynb | 169 ++++++++++++++++++++++++++++++++
 2 files changed, 171 insertions(+)
 create mode 100644 notebooks/wikipedia_regex.ipynb

diff --git a/README.org b/README.org
index 56270db..9737745 100644
--- a/README.org
+++ b/README.org
@@ -32,6 +32,8 @@ So far, notebooks are listed by difficulty, indicated by stars (☆ = simple, 
 - [[file:notebooks/Twitter.ipynb][Twitter]] :: analysing Twitter data (raw JSON from Twitter's API) (☆)
 - [[file:notebooks/wikipedia_language_editions.ipynb][Wikipedia language editions]] :: plotting the depth and number of
   articles of different Wikipedia language editions (☆)
+- [[file:notebooks/wikipedia_regex.ipynb][Regular expressions]] :: simple information extraction from Wikipedia
+  articles (☆)
 - [[file:notebooks/amazon_reviews.ipynb][Amazon reviews]] :: crawling web sites with [[https://scrapy.org/][Scrapy]], processing JSON
   data, basic statistics and visualisation (☆☆)
 - [[file:notebooks/Art.ipynb][Art]] :: Creating computer-generated art by translation, scaling and
diff --git a/notebooks/wikipedia_regex.ipynb b/notebooks/wikipedia_regex.ipynb
new file mode 100644
index 0000000..37ddd91
--- /dev/null
+++ b/notebooks/wikipedia_regex.ipynb
@@ -0,0 +1,169 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Anwendung von regulären Ausdrücken am Beispiel von Wikipedia-Seiten"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## IP-Adressen\n",
+    "\n",
+    "Wir wollen alle IP-Adressen aus der [zugehörigen Wikipedia-Seite](https://de.wikipedia.org/wiki/IP-Adresse) extrahieren. Wir verwenden dazu den regulären Ausdruck `[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+`, der zwar recht allgemein ist und viele Muster zulässt, die keine IP-Adressen sind (z.B. 1000.1000.1000.1000), für dieses Beispiel aber völlig ausreicht:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0.0.0.0\n",
+      "0.0.0.0\n",
+      "0.255.255.255\n",
+      "000.000.000.003\n",
+      "10.0.0.0\n",
+      "10.0.0.0\n",
+      "10.255.255.255\n",
+      "100.127.255.255\n",
+      "100.64.0.0\n",
+      "100.64.0.0\n",
+      "127.0.0.0\n",
+      "127.0.0.0\n",
+      "127.0.0.0\n",
+      "127.0.0.1\n",
+      "127.0.0.1\n",
+      "127.255.255.255\n",
+      "128.0.0.0\n",
+      "128.0.0.0\n",
+      "128.0.255.255\n",
+      "13.0.0.0\n",
+      "14.0.0.0\n",
+      "14.0.0.0\n",
+      "14.255.255.255\n",
+      "169.254.0.0\n",
+      "169.254.0.0\n",
+      "169.254.255.255\n",
+      "172.16.0.0\n",
+      "172.16.0.0\n",
+      "172.31.255.255\n",
+      "191.255.0.0\n",
+      "191.255.0.0\n",
+      "191.255.255.255\n",
+      "192.0.0.0\n",
+      "192.0.0.0\n",
+      "192.0.0.0\n",
+      "192.0.0.0\n",
+      "192.0.0.255\n",
+      "192.0.0.7\n",
+      "192.0.2.0\n",
+      "192.0.2.0\n",
+      "192.0.2.255\n",
+      "192.0.2.42\n",
+      "192.168.0.0\n",
+      "192.168.0.0\n",
+      "192.168.0.254\n",
+      "192.168.0.254\n",
+      "192.168.0.254\n",
+      "192.168.0.254\n",
+      "192.168.0.255\n",
+      "192.168.0.255\n",
+      "192.168.1.1\n",
+      "192.168.1.2\n",
+      "192.168.2.254\n",
+      "192.168.2.254\n",
+      "192.168.2.254\n",
+      "192.168.255.255\n",
+      "192.88.99.0\n",
+      "192.88.99.0\n",
+      "192.88.99.255\n",
+      "198.18.0.0\n",
+      "198.18.0.0\n",
+      "198.19.255.255\n",
+      "198.51.100.0\n",
+      "198.51.100.0\n",
+      "198.51.100.255\n",
+      "203.0.113.0\n",
+      "203.0.113.0\n",
+      "203.0.113.192\n",
+      "203.0.113.195\n",
+      "203.0.113.195\n",
+      "203.0.113.255\n",
+      "203.000.113.192\n",
+      "203.000.113.195\n",
+      "203.000.113.195\n",
+      "223.255.255.0\n",
+      "223.255.255.0\n",
+      "223.255.255.255\n",
+      "224.0.0.0\n",
+      "224.0.0.0\n",
+      "239.255.255.255\n",
+      "24.0.0.0\n",
+      "24.0.0.0\n",
+      "24.255.255.255\n",
+      "240.0.0.0\n",
+      "240.0.0.0\n",
+      "255.255.255.192\n",
+      "255.255.255.224\n",
+      "255.255.255.224\n",
+      "255.255.255.224\n",
+      "255.255.255.255\n",
+      "255.255.255.255\n",
+      "255.255.255.255\n",
+      "255.255.255.255\n",
+      "340.282.366.920\n",
+      "39.0.0.0\n",
+      "39.0.0.0\n",
+      "39.255.255.255\n",
+      "4.294.967.296\n",
+      "53.0.0.0\n",
+      "54.0.0.0\n",
+      "607.431.768.211\n",
+      "665.570.793.348\n",
+      "9.0.0.0\n",
+      "93.184.216.34\n",
+      "938.463.463.374\n"
+     ]
+    }
+   ],
+   "source": [
+    "import urllib.request\n",
+    "import re\n",
+    "\n",
+    "with urllib.request.urlopen(\"https://de.wikipedia.org/wiki/IP-Adresse\") as f:\n",
+    "    html = f.read().decode('utf8')\n",
+    "# raw string: \\. must reach the regex engine as an escaped dot (no invalid-escape warning)\n",
+    "for ipaddress in sorted(re.findall(r\"[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+\", html)):\n",
+    "    print(ipaddress)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
-- 
GitLab