%% Cell type:markdown id: tags:
# Scrape all reviews from a literature blog
Here, we use the Python package `scrapy` to download all reviews from a literature blog.
- First, we collect all links that redirect to a review.
- Then, we extract the review text, the author's name, the book title, and the URL, and save everything to a JSON file.
%% Cell type:code id: tags:
```
# install the package if it is not already available
try:
    import scrapy
except ImportError:
    !pip install scrapy
    import scrapy
```
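%% Cell type:markdown id: tags:
As a quick sanity check, we can print the installed `scrapy` version to confirm that the import worked:
%% Cell type:code id: tags:
```
# print the installed scrapy version
print(scrapy.__version__)
```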
%% Cell type:code id: tags:
```
# create a scrapy project called "scrape_tintenhain"
!scrapy startproject scrape_tintenhain
```
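%% Cell type:markdown id: tags:
To see what `scrapy startproject` generated, we can list the files in the new project directory. This is a small sketch and assumes the command above completed without errors:
%% Cell type:code id: tags:
```
# list the files that `scrapy startproject` generated
import os

for root, dirs, files in os.walk("scrape_tintenhain"):
    # ignore compiled byte-code directories
    dirs[:] = [d for d in dirs if d != "__pycache__"]
    for name in files:
        print(os.path.join(root, name))
```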
%% Cell type:code id: tags:
```
# keeping tabs on the project directory
import os
import logging
```
%% Cell type:code id: tags:
```
# Write a spider class, which will follow a set of rules and collect specific items.
# remember the project root directory so we can come back to it later
pwd = os.getcwd()
# spider definition (written to a file below):
class_description = """
# define spider
import scrapy
import re
import logging


class linkItem(scrapy.Item):
    # define the fields for your result items
    link = scrapy.Field()
    author = scrapy.Field()
    title = scrapy.Field()
    text = scrapy.Field()


class TintenhainSpider(scrapy.Spider):
    # give the spider a name
    name = 'tintenhain'
    # where to start the crawling
    start_urls = [
        'https://tintenhain.de/rezensionen/',
    ]
    # write all scraped items into a JSON file
    custom_settings = {"FEEDS": {"tintenhain.json": {"format": "json"}}}

    # specify which information we want to collect from the start_urls using XPath
    def parse(self, response):
        # authors = response.xpath('//p[contains(@style,"padding-left")]/text()').getall()
        # titles = response.xpath('//p[contains(@style,"padding-left")]/a[@href]/text()').getall()
        hrefs = response.xpath('//p[contains(@style,"padding-left")]/a/@href').getall()
        # loop through all collected href links
        for href in hrefs:
            # call the method that parses the links to collect all relevant information
            yield scrapy.Request(href, callback=self.parse_links)

    # specify which information we want to collect from the collected links using XPath
    def parse_links(self, response):
        item = linkItem()
        # get the URL
        item["link"] = response.url
        # get the author name (the part of the heading before the colon)
        item["author"] = response.xpath('//h1/text()').get().split(":")[0].strip()
        # get the book title (the part after the colon, without the "[Rezension]" suffix)
        item["title"] = response.xpath('//h1/text()').get().split(":")[1].replace("[Rezension]", "").strip()
        # get all text from the page
        item["text"] = " ".join(response.xpath('//p/text()').getall())
        return item
"""
# save the spider in the correct directory
with open("scrape_tintenhain/scrape_tintenhain/spiders/scrape_tintenhain_spider.py", 'w') as file_handle:
    file_handle.write(class_description)
# todo: At the moment, the spider collects all text from "p" elements.
# We need to adjust the parsing such that the spider only collects the review text.
```
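%% Cell type:markdown id: tags:
Before crawling the whole site, the XPath expression for the link extraction can be tried out in isolation with scrapy's `Selector`. The HTML snippet below is made up and only mimics the structure of the review overview page that the spider assumes; it is a sketch, not real data from the blog.
%% Cell type:code id: tags:
```
# try the link-extraction XPath on a small, made-up HTML snippet
from scrapy.selector import Selector

sample_html = """
<html><body>
  <p style="padding-left: 40px;">Jane Doe:
    <a href="https://example.org/jane-doe-some-book-rezension/">Some Book [Rezension]</a>
  </p>
  <p>unrelated paragraph without padding-left</p>
</body></html>
"""

selector = Selector(text=sample_html)
# the same expression the spider's parse() method uses
hrefs = selector.xpath('//p[contains(@style,"padding-left")]/a/@href').getall()
print(hrefs)
```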
%% Cell type:code id: tags:
```
# All you have to do now is run the spider by calling its name from within the project directory!
print(os.listdir("."))
# change into the project directory (this cell assumes we start from the notebook's root directory)
os.chdir("scrape_tintenhain")
# list all available spiders
!scrapy list
# run the 'tintenhain' spider; the results are written to tintenhain.json (see the FEEDS setting above)
!scrapy crawl tintenhain
# Alternative: run the spider programmatically with scrapy's CrawlerProcess, e.g. to set a
# custom user agent. The spider class then has to be imported from the module written above,
# and a CrawlerProcess can only be started once per Python session:
# from scrapy.crawler import CrawlerProcess
# from scrape_tintenhain.spiders.scrape_tintenhain_spider import TintenhainSpider
# process = CrawlerProcess({
#     'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
# })
# process.crawl(TintenhainSpider)
# process.start()
```
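%% Cell type:markdown id: tags:
Once the crawl has finished, the collected items can be inspected directly from the JSON feed. This sketch assumes the crawl above succeeded and wrote `tintenhain.json` into the current working directory (the project directory after the `os.chdir` above):
%% Cell type:code id: tags:
```
# load and inspect the scraped reviews
import json

with open("tintenhain.json", encoding="utf-8") as file_handle:
    reviews = json.load(file_handle)

print(len(reviews), "reviews scraped")
# show author, title, and link of the first review (without the full text)
if reviews:
    print(reviews[0]["author"], "-", reviews[0]["title"])
    print(reviews[0]["link"])
```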