From bef8275c9e6dbca4240bcbe92ead5b1cfb3655f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Robert=20J=C3=A4schke?= <jaeschke@l3s.de> Date: Wed, 25 Jan 2023 17:00:39 +0100 Subject: [PATCH] added regex example --- README.org | 2 + notebooks/wikipedia_regex.ipynb | 169 ++++++++++++++++++++++++++++++++ 2 files changed, 171 insertions(+) create mode 100644 notebooks/wikipedia_regex.ipynb diff --git a/README.org b/README.org index 56270db..9737745 100644 --- a/README.org +++ b/README.org @@ -32,6 +32,8 @@ So far, notebooks are listed by difficulty, indicated by stars (☆ = simple, - [[file:notebooks/Twitter.ipynb][Twitter]] :: analysing Twitter data (raw JSON from Twitter's API) (☆) - [[file:notebooks/wikipedia_language_editions.ipynb][Wikipedia language editions]] :: plotting the depth and number of articles of different Wikipedia language editions (☆) +- [[file:notebooks/wikipedia_regex.ipynb][Regular expressions]] :: simple information extraction from Wikipedia + articles (☆) - [[file:notebooks/amazon_reviews.ipynb][Amazon reviews]] :: crawling web sites with [[https://scrapy.org/][Scrapy]], processing JSON data, basic statistics and visualisation (☆☆) - [[file:notebooks/Art.ipynb][Art]] :: Creating computer-generated art by translation, scaling and diff --git a/notebooks/wikipedia_regex.ipynb b/notebooks/wikipedia_regex.ipynb new file mode 100644 index 0000000..37ddd91 --- /dev/null +++ b/notebooks/wikipedia_regex.ipynb @@ -0,0 +1,169 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Anwendung von regulären Ausdrücken am Beispiel von Wikipedia-Seiten" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## IP-Adressen\n", + "\n", + "Wir wollen alle IP-Adressen aus der [zugehörigen Wikipedia-Seite](https://de.wikipedia.org/wiki/IP-Adresse) extrahieren. Wir verwenden dazu den regulären Ausdruck `[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+`, der zwar recht allgemein ist und viele Muster zulässt, die keine IP-Adressen sind (z.B. 
1000.1000.1000.1000), für dieses Beispiel aber völlig ausreicht:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.0.0.0\n", + "0.0.0.0\n", + "0.255.255.255\n", + "000.000.000.003\n", + "10.0.0.0\n", + "10.0.0.0\n", + "10.255.255.255\n", + "100.127.255.255\n", + "100.64.0.0\n", + "100.64.0.0\n", + "127.0.0.0\n", + "127.0.0.0\n", + "127.0.0.0\n", + "127.0.0.1\n", + "127.0.0.1\n", + "127.255.255.255\n", + "128.0.0.0\n", + "128.0.0.0\n", + "128.0.255.255\n", + "13.0.0.0\n", + "14.0.0.0\n", + "14.0.0.0\n", + "14.255.255.255\n", + "169.254.0.0\n", + "169.254.0.0\n", + "169.254.255.255\n", + "172.16.0.0\n", + "172.16.0.0\n", + "172.31.255.255\n", + "191.255.0.0\n", + "191.255.0.0\n", + "191.255.255.255\n", + "192.0.0.0\n", + "192.0.0.0\n", + "192.0.0.0\n", + "192.0.0.0\n", + "192.0.0.255\n", + "192.0.0.7\n", + "192.0.2.0\n", + "192.0.2.0\n", + "192.0.2.255\n", + "192.0.2.42\n", + "192.168.0.0\n", + "192.168.0.0\n", + "192.168.0.254\n", + "192.168.0.254\n", + "192.168.0.254\n", + "192.168.0.254\n", + "192.168.0.255\n", + "192.168.0.255\n", + "192.168.1.1\n", + "192.168.1.2\n", + "192.168.2.254\n", + "192.168.2.254\n", + "192.168.2.254\n", + "192.168.255.255\n", + "192.88.99.0\n", + "192.88.99.0\n", + "192.88.99.255\n", + "198.18.0.0\n", + "198.18.0.0\n", + "198.19.255.255\n", + "198.51.100.0\n", + "198.51.100.0\n", + "198.51.100.255\n", + "203.0.113.0\n", + "203.0.113.0\n", + "203.0.113.192\n", + "203.0.113.195\n", + "203.0.113.195\n", + "203.0.113.255\n", + "203.000.113.192\n", + "203.000.113.195\n", + "203.000.113.195\n", + "223.255.255.0\n", + "223.255.255.0\n", + "223.255.255.255\n", + "224.0.0.0\n", + "224.0.0.0\n", + "239.255.255.255\n", + "24.0.0.0\n", + "24.0.0.0\n", + "24.255.255.255\n", + "240.0.0.0\n", + "240.0.0.0\n", + "255.255.255.192\n", + "255.255.255.224\n", + "255.255.255.224\n", + "255.255.255.224\n", + "255.255.255.255\n", + 
"255.255.255.255\n", + "255.255.255.255\n", + "255.255.255.255\n", + "340.282.366.920\n", + "39.0.0.0\n", + "39.0.0.0\n", + "39.255.255.255\n", + "4.294.967.296\n", + "53.0.0.0\n", + "54.0.0.0\n", + "607.431.768.211\n", + "665.570.793.348\n", + "9.0.0.0\n", + "93.184.216.34\n", + "938.463.463.374\n" + ] + } + ], + "source": [ + "import urllib.request\n", + "import re\n", + "\n", + "with urllib.request.urlopen(\"https://de.wikipedia.org/wiki/IP-Adresse\") as f:\n", + " html = f.read().decode('utf8')\n", + " \n", + " for ipaddress in sorted(re.findall(\"[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+\", html)):\n", + " print(ipaddress)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} -- GitLab