def equal_chars(w, v):
    """Return the characters that *w* and *v* share at identical positions.

    The comparison is position-wise and stops at the end of the shorter
    string, so the result is symmetric in its two arguments.

    >>> equal_chars("Wiesbaden", "Potsdam")
    ['s', 'a']
    """
    # zip() truncates at the shorter string, replacing the original's
    # explicit range(min(map(len, [v, w]))) index loop with the idiomatic
    # pairwise iteration.
    return [a for a, b in zip(v, w) if a == b]
def sim_graph(words):
    """Build an undirected similarity graph over *words*.

    Two words are connected whenever they agree in at least one character
    position; the edge carries the shared characters as ``label`` and their
    count as ``weight``.
    """
    graph = nx.Graph()
    for later_idx, later in enumerate(words):
        # Slicing up to later_idx visits every unordered pair exactly once,
        # mirroring the original's k > l guard over a full double loop.
        for earlier in words[:later_idx]:
            shared = equal_chars(later, earlier)
            if shared:  # skip pairs with no matching position
                graph.add_edge(later, earlier,
                               label="".join(shared), weight=len(shared))
    return graph
their difficulty (☆ = simple, ☆☆ = advanced, ☆☆☆ = sophisticated): data, basic statistics and visualisation (☆☆) - [[file:crawling_a_blog.ipynb][Crawling a blog]] :: crawling web sites, basic text mining, basic statistics and visualisation (☆☆) -- [[file:distances.ipynb][Distances]] :: Comprehensive interactive simulation of recovering +- [[file:distances.ipynb][Distances]] :: comprehensive interactive simulation of recovering information from noisy data (namely, point positions given their noisy distance matrix) (☆☆☆) - [[file:exponential_smoothing.ipynb][Exponential smoothing]] :: using [[https://ipywidgets.readthedocs.io/en/latest/examples/Widget%2520Basics.html][Jupyter's interactive widget]] to explore [[https://en.wikipedia.org/wiki/Exponential_smoothing][exponential smoothing]] (☆) +- [[file:Hamming.ipynb][Hamming]] :: a graph visualising a strange type of word similarity (☆) - [[file:Jupyter-Demo.ipynb][Jupyter demo]] :: demo of some Jupyter features useful for creating learning material (☆) - [[file:statistics_top50faculty.ipynb][Statistics top 50 faculty]] :: exploratory statistical analysis of the -- GitLab