Passlida Saila / Notebooks / Commits / 8cc7e868

Commit 8cc7e868, authored 3 years ago by Prof. Dr. Robert Jäschke

Merge branch 'master' of scm.cms.hu-berlin.de:ibi/notebooks

Parents: 9246b00c, 5295ea83
Showing 1 changed file: scrape_review_blog.ipynb (13 additions, 20 deletions)
```
@@ -32,7 +32,7 @@
    "outputs": [],
    "source": [
     "# create a scrapy project called \"scrape_tintenhain\"\n",
-    "!scrapy startproject scrape_tintenhain"
+    "# !scrapy startproject scrape_tintenhain"
    ]
   },
   {
@@ -56,11 +56,10 @@
     "\n",
     "\n",
     "# go to the correct directory\n",
-    "pwd = os.getcwd()\n",
-    "os.chdir(pwd)\n",
+    "# pwd = os.getcwd()\n",
+    "# os.chdir(pwd)\n",
     "\n",
     "# spider definition:\n",
-    "class_description = \"\"\"\n",
     "# define spider\n",
     "import scrapy\n",
     "import re\n",
@@ -73,7 +72,7 @@
     "    text = scrapy.Field()\n",
     "    \n",
     "\n",
-    "class ToScrapeSpiderXPath(scrapy.Spider):\n",
+    "class TintenhainSpider(scrapy.Spider):\n",
     "    # get a name for the spider\n",
     "    name = 'tintenhain'\n",
     "    \n",
@@ -105,14 +104,7 @@
     "        # get all text from link \n",
     "        item[\"text\"] = \" \".join(response.xpath('//p/text()').getall())\n",
     "        return item\n",
-    "\"\"\"\n",
-    "\n",
-    "# save the spider in the correct directory\n",
-    "with open(\"scrape_tintenhain/scrape_tintenhain/spiders/scrape_tintenhain_spider.py\",'w') as file_handle:\n",
-    "    file_handle.write(class_description)\n",
-    "\n",
-    "    \n",
     "# todo: At the moment, the spider collects all text from \"p\" elements. \n",
     "# We need to adjust the parsing such that the spider only collects the review text"
    ]
   },
@@ -124,14 +116,15 @@
    "outputs": [],
    "source": [
     "# All you have to do now is run the spider by calling its name from within the project directory!\n",
-    "print(os.listdir(\".\"))\n",
-    "os.chdir(\"scrape_tintenhain\")\n",
-    "\n",
-    "# List all available spiders\n",
-    "!scrapy list\n",
+    "# print(os.listdir(\".\"))\n",
+    "# os.chdir(\"scrape_tintenhain\")\n",
+    "from scrapy.crawler import CrawlerProcess\n",
+    "process = CrawlerProcess({\n",
+    "    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'\n",
+    "})\n",
     "\n",
     "# run the 'tintenhain'-spider\n",
-    "!scrapy crawl tintenhain"
+    "process.crawl(TintenhainSpider)\n",
+    "process.start()"
    ]
   }
  ],
```
%% Cell type:markdown id: tags:
# Scrape all reviews from a literature blog
Here, we use the Python package `scrapy` to download all reviews from a literature blog.

- First, we collect all links that redirect to a review.
- Then, we get the review text, the author name, the title of the book, and the URL, and save all of it in a JSON file.
%% Cell type:code id: tags:
```
# install the package if it is not already available
try:
    import scrapy
except ImportError:
    !pip install scrapy
    import scrapy
```
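As an aside (not part of the original notebook): inside Jupyter, the `%pip` magic is usually preferable to `!pip`, since it is guaranteed to install into the environment of the running kernel.

```
# equivalent install using the %pip magic (assumes an IPython/Jupyter kernel)
%pip install scrapy
```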
%% Cell type:code id: tags:
```
# create a scrapy project called "scrape_tintenhain"
# (only needed once, therefore commented out here)
# !scrapy startproject scrape_tintenhain
```
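For orientation, `scrapy startproject` scaffolds a standard directory layout. The sketch below shows the generic Scrapy structure (standard library behaviour, nothing specific to this commit) and a quick way to check that the project exists:

```
# standard layout created by `scrapy startproject scrape_tintenhain`:
#
#   scrape_tintenhain/
#       scrapy.cfg            # deploy configuration
#       scrape_tintenhain/    # the project package
#           settings.py       # project-wide settings (e.g. USER_AGENT, FEEDS)
#           items.py          # item definitions
#           pipelines.py      # item pipelines
#           spiders/          # spider modules live here
import os
print(os.path.isdir("scrape_tintenhain/scrape_tintenhain/spiders"))  # True once the project exists
```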
%% Cell type:code id: tags:
```
# modules for keeping tabs on the working directory and for logging
import os
import logging
```
%% Cell type:code id: tags:
```
# Writing a spider class, which will follow a set of rules and collect specific items.

# go to the correct directory (not needed when working from the notebook directory)
# pwd = os.getcwd()
# os.chdir(pwd)

# spider definition
import scrapy
import re
import logging

class linkItem(scrapy.Item):
    # define the fields for your result items
    link = scrapy.Field()
    author = scrapy.Field()
    title = scrapy.Field()
    text = scrapy.Field()


class TintenhainSpider(scrapy.Spider):
    # the name of the spider
    name = 'tintenhain'

    # where to start the crawling
    start_urls = [
        'https://tintenhain.de/rezensionen/',
    ]

    # write all collected items to a JSON feed
    custom_settings = {"FEEDS": {"tintenhain.json": {"format": "json"}}}

    # specify which information we want to collect from the start_urls using XPath
    def parse(self, response):
        # authors = response.xpath('//p[contains(@style,"padding-left")]/text()').getall()
        # titles = response.xpath('//p[contains(@style,"padding-left")]/a[@href]/text()').getall()
        hrefs = response.xpath('//p[contains(@style,"padding-left")]/a/@href').getall()

        # loop through all collected href links
        for href in hrefs:
            # call the method that parses each linked page and collects the relevant information
            yield scrapy.Request(href, callback=self.parse_links)

    # specify which information we want to collect from the linked review pages using XPath
    def parse_links(self, response):
        item = linkItem()
        # get the URL
        item["link"] = response.url
        # get the author name (the part of the heading before the colon)
        item["author"] = response.xpath('//h1/text()').get().split(":")[0].strip()
        # get the book title; use replace() to drop the "[Rezension]" tag
        # (str.strip("[Rezension]") would strip individual characters, not the tag)
        item["title"] = response.xpath('//h1/text()').get().split(":")[1].replace("[Rezension]", "").strip()
        # get all text from the page
        item["text"] = " ".join(response.xpath('//p/text()').getall())
        return item

# todo: At the moment, the spider collects all text from "p" elements.
# We need to adjust the parsing such that the spider only collects the review text.
```
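One hedged sketch for the todo above: restrict the XPath to the post body instead of all `p` elements on the page. The `entry-content` class is an assumption (typical WordPress markup) and would need to be checked against the actual HTML of tintenhain.de:

```
import scrapy

def extract_review_text(response: scrapy.http.HtmlResponse) -> str:
    # collect text only from paragraphs inside the (assumed) post body container;
    # "entry-content" is a guess at the blog's markup, not verified
    paragraphs = response.xpath(
        '//div[contains(@class, "entry-content")]//p//text()'
    ).getall()
    return " ".join(p.strip() for p in paragraphs if p.strip())
```

Inside `parse_links`, `item["text"] = extract_review_text(response)` would then replace the current catch-all query.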
%% Cell type:code id: tags:
```
# All you have to do now is run the spider!
# print(os.listdir("."))
# os.chdir("scrape_tintenhain")

# run the spider directly from the notebook via CrawlerProcess
from scrapy.crawler import CrawlerProcess

process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})

# run the 'tintenhain' spider
process.crawl(TintenhainSpider)
process.start()
```
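Two practical notes: `process.start()` blocks until the crawl finishes, and because Scrapy runs on top of Twisted, the reactor cannot be restarted, so this cell works only once per kernel session (restart the kernel to crawl again). Afterwards, the items can be read back from the JSON feed; the relative path below assumes the working directory was not changed:

```
import json

# read the feed declared in custom_settings ("tintenhain.json", relative to the
# current working directory; adjust the path if os.chdir() was used above)
with open("tintenhain.json", encoding="utf-8") as file_handle:
    reviews = json.load(file_handle)

print(f"collected {len(reviews)} reviews")
```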