def equal_chars(w, v):
    """Return the characters that *w* and *v* share at identical positions.

    The comparison is position-wise and stops at the end of the shorter
    string, so the result is symmetric in its two arguments.

    >>> equal_chars("Wiesbaden", "Potsdam")
    ['s', 'a']
    """
    # zip() truncates at the shorter string, replacing the original's
    # explicit range(min(map(len, [v, w]))) index loop with the idiomatic
    # pairwise iteration.
    return [a for a, b in zip(v, w) if a == b]
def sim_graph(words):
    """Build an undirected similarity graph over *words*.

    Two words are connected whenever they agree in at least one character
    position; the edge carries the shared characters as ``label`` and their
    count as ``weight``.
    """
    graph = nx.Graph()
    for later_idx, later in enumerate(words):
        # Slicing up to later_idx visits every unordered pair exactly once,
        # mirroring the original's k > l guard over a full double loop.
        for earlier in words[:later_idx]:
            shared = equal_chars(later, earlier)
            if shared:  # skip pairs with no matching position
                graph.add_edge(later, earlier,
                               label="".join(shared), weight=len(shared))
    return graph
their difficulty (☆ = simple, ☆☆ = advanced, ☆☆☆ = sophisticated): data, basic statistics and visualisation (☆☆) - [[file:crawling_a_blog.ipynb][Crawling a blog]] :: crawling web sites, basic text mining, basic statistics and visualisation (☆☆) -- [[file:distances.ipynb][Distances]] :: Comprehensive interactive simulation of recovering +- [[file:distances.ipynb][Distances]] :: comprehensive interactive simulation of recovering information from noisy data (namely, point positions given their noisy distance matrix) (☆☆☆) - [[file:exponential_smoothing.ipynb][Exponential smoothing]] :: using [[https://ipywidgets.readthedocs.io/en/latest/examples/Widget%2520Basics.html][Jupyter's interactive widget]] to explore [[https://en.wikipedia.org/wiki/Exponential_smoothing][exponential smoothing]] (☆) +- [[file:Hamming.ipynb][Hamming]] :: a graph visualising a strange type of word similarity (☆) - [[file:Jupyter-Demo.ipynb][Jupyter demo]] :: demo of some Jupyter features useful for creating learning material (☆) - [[file:statistics_top50faculty.ipynb][Statistics top 50 faculty]] :: exploratory statistical analysis of the -- GitLab