%% Cell type:markdown id: tags:
# Scrape all reviews from a literature blog
Here, we use the Python package `scrapy` to download all reviews from a literature blog.
- First, we collect all links that redirect to a review.
- Then, we extract the review text, the author's name, the book title, and the URL, and save everything to a JSON file.
%% Cell type:code id: tags:
```
# install the package if it is not already available
try:
    import scrapy
except ImportError:
    !pip install scrapy
    import scrapy
```
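%% Cell type:markdown id: tags:
As a quick sanity check, we can print the installed `scrapy` version to confirm that the import worked:
%% Cell type:code id: tags:
```
# print the installed scrapy version
print(scrapy.__version__)
```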
%% Cell type:code id: tags:
```
# create a scrapy project called "scrape_tintenhain"
!scrapy startproject scrape_tintenhain
```
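%% Cell type:markdown id: tags:
To see what `scrapy startproject` generated, we can list the files in the new project directory. This is a small sketch and assumes the command above completed without errors:
%% Cell type:code id: tags:
```
# list the files that `scrapy startproject` generated
import os

for root, dirs, files in os.walk("scrape_tintenhain"):
    # ignore compiled byte-code directories
    dirs[:] = [d for d in dirs if d != "__pycache__"]
    for name in files:
        print(os.path.join(root, name))
```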
%% Cell type:code id: tags:
```
# keeping tabs on the project directory
import os
import logging
```
%% Cell type:code id: tags:
```
# Write a spider class, which will follow a set of rules and collect specific items.
# remember the project root directory so we can come back to it later
pwd = os.getcwd()
# spider definition (written to a file below):
class_description = """
# define spider
import scrapy
import re
import logging


class linkItem(scrapy.Item):
    # define the fields for your result items
    link = scrapy.Field()
    author = scrapy.Field()
    title = scrapy.Field()
    text = scrapy.Field()


class TintenhainSpider(scrapy.Spider):
    # give the spider a name
    name = 'tintenhain'
    # where to start the crawling
    start_urls = [
        'https://tintenhain.de/rezensionen/',
    ]
    # write all scraped items into a JSON file
    custom_settings = {"FEEDS": {"tintenhain.json": {"format": "json"}}}

    # specify which information we want to collect from the start_urls using XPath
    def parse(self, response):
        # authors = response.xpath('//p[contains(@style,"padding-left")]/text()').getall()
        # titles = response.xpath('//p[contains(@style,"padding-left")]/a[@href]/text()').getall()
        hrefs = response.xpath('//p[contains(@style,"padding-left")]/a/@href').getall()
        # loop through all collected href links
        for href in hrefs:
            # call the method that parses the links to collect all relevant information
            yield scrapy.Request(href, callback=self.parse_links)

    # specify which information we want to collect from the collected links using XPath
    def parse_links(self, response):
        item = linkItem()
        # get the URL
        item["link"] = response.url
        # get the author name (the part of the heading before the colon)
        item["author"] = response.xpath('//h1/text()').get().split(":")[0].strip()
        # get the book title (the part after the colon, without the "[Rezension]" suffix)
        item["title"] = response.xpath('//h1/text()').get().split(":")[1].replace("[Rezension]", "").strip()
        # get all text from the page
        item["text"] = " ".join(response.xpath('//p/text()').getall())
        return item
"""
# save the spider in the correct directory
with open("scrape_tintenhain/scrape_tintenhain/spiders/scrape_tintenhain_spider.py", 'w') as file_handle:
    file_handle.write(class_description)
# todo: At the moment, the spider collects all text from "p" elements.
# We need to adjust the parsing such that the spider only collects the review text.
```
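%% Cell type:markdown id: tags:
Before crawling the whole site, the XPath expression for the link extraction can be tried out in isolation with scrapy's `Selector`. The HTML snippet below is made up and only mimics the structure of the review overview page that the spider assumes; it is a sketch, not real data from the blog.
%% Cell type:code id: tags:
```
# try the link-extraction XPath on a small, made-up HTML snippet
from scrapy.selector import Selector

sample_html = """
<html><body>
  <p style="padding-left: 40px;">Jane Doe:
    <a href="https://example.org/jane-doe-some-book-rezension/">Some Book [Rezension]</a>
  </p>
  <p>unrelated paragraph without padding-left</p>
</body></html>
"""

selector = Selector(text=sample_html)
# the same expression the spider's parse() method uses
hrefs = selector.xpath('//p[contains(@style,"padding-left")]/a/@href').getall()
print(hrefs)
```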
%% Cell type:code id: tags:
```
# All you have to do now is run the spider by calling its name from within the project directory!
print(os.listdir("."))
# change into the project directory (this cell assumes we start from the notebook's root directory)
os.chdir("scrape_tintenhain")
# list all available spiders
!scrapy list
# run the 'tintenhain' spider; the results are written to tintenhain.json (see the FEEDS setting above)
!scrapy crawl tintenhain
# Alternative: run the spider programmatically with scrapy's CrawlerProcess, e.g. to set a
# custom user agent. The spider class then has to be imported from the module written above,
# and a CrawlerProcess can only be started once per Python session:
# from scrapy.crawler import CrawlerProcess
# from scrape_tintenhain.spiders.scrape_tintenhain_spider import TintenhainSpider
# process = CrawlerProcess({
#     'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
# })
# process.crawl(TintenhainSpider)
# process.start()
```
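%% Cell type:markdown id: tags:
Once the crawl has finished, the collected items can be inspected directly from the JSON feed. This sketch assumes the crawl above succeeded and wrote `tintenhain.json` into the current working directory (the project directory after the `os.chdir` above):
%% Cell type:code id: tags:
```
# load and inspect the scraped reviews
import json

with open("tintenhain.json", encoding="utf-8") as file_handle:
    reviews = json.load(file_handle)

print(len(reviews), "reviews scraped")
# show author, title, and link of the first review (without the full text)
if reviews:
    print(reviews[0]["author"], "-", reviews[0]["title"])
    print(reviews[0]["link"])
```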