#collapse_show
# Import libraries used below
import requests
import urllib.request
import urllib.parse
import time
import io
from bs4 import BeautifulSoup
import pandas as pd
import datetime
import os
Introduction
As the summary explains, this blog post will very quickly explain how to automatically download French government data on hospitalization and testing pertaining to COVID-19.
Data sources
- Hospitalization data
The various datasets concerning hospitalization data are found here.
If you follow the link you will find 4 csv datasets concerning hospitalization data along with 5 other csv files with metadata and documentation.
- Testing data
The various datasets concerning testing data are found here.
If you follow the link you will find 2 csv datasets concerning testing data along with 2 other csv files with metadata and documentation.
In both cases we want to download the first of the links since they contain the pertinent daily updated data (do have a look manually at the metadata and documentation files to make sure this is what you want).
Code
Getting the main page
Let’s first have a look at the main landing pages that I provided above.
# Store the landing-page URL for each dataset:
# url_cases -> hospitalization data, url_tests -> city-lab testing data.
url_cases = 'https://www.data.gouv.fr/fr/datasets/donnees-hospitalieres-relatives-a-lepidemie-de-covid-19/'
url_tests = 'https://www.data.gouv.fr/fr/datasets/donnees-relatives-aux-tests-de-depistage-de-covid-19-realises-en-laboratoire-de-ville/'

# Get the HTTP response for each URL.
response_cases = requests.get(url_cases)
response_tests = requests.get(url_tests)
The response here should be 200 (see the list of HTTP status codes here).
print(response_cases, response_tests)
<Response [200]> <Response [200]>
# Parse the HTML content of each returned page with BeautifulSoup.
soupcases = BeautifulSoup(response_cases.text, "html.parser")
souptests = BeautifulSoup(response_tests.text, "html.parser")
# Let's look at the links in the main page (for testing data - if you want
# cases, replace souptests with soupcases below).
# find_all() is called once and the result iterated directly, instead of
# re-querying the soup for every index as in a range(len(...)) loop.
for link in souptests.find_all('a', class_="btn btn-sm btn-primary"):
    print(link.get('href'))
None
https://www.data.gouv.fr/fr/datasets/r/b4ea7b4b-b7d1-4885-a099-71852291ff20
None
https://www.data.gouv.fr/fr/datasets/r/72050bc8-9959-4bb1-88a0-684ff8db5fb5
None
https://www.data.gouv.fr/fr/datasets/r/971c5cbd-cd80-4492-b2b3-c3deff8c1f5e
None
https://www.data.gouv.fr/fr/datasets/r/db378f2a-83a1-40fd-a16c-06c4c8c3535d
https://www.data.gouv.fr/fr/datasets/r/49ba79e6-0153-40b1-b050-821e102959eb
None
https://www.data.gouv.fr/fr/datasets/r/59e82d52-e07a-4ae8-9a49-2d1fd2d2ec21
We see that the pertinent file in each case (testing or hospitalization data) is the first link on its page. So we save only this one as done below:
# If we want to save that first URL we can do as follows.
# Index 1 is used because element 0 has no href (it printed None above);
# element 1 is the first link carrying the pertinent CSV URL.
casescsvurl = soupcases.find_all('a', class_="btn btn-sm btn-primary")[1].get('href')
testscsvurl = souptests.find_all('a', class_="btn btn-sm btn-primary")[1].get('href')
Getting the CSV data
We now have the URL for the CSV files we want so we’ll do similar steps as above to download these files.
# Similarly as above, requests.get the CSV URL:
rectests = requests.get(testscsvurl)
reccases = requests.get(casescsvurl)
What to do with the CSV data
Now that you have the data, what to do with it?
It depends on your purpose I guess: * First write the data to a CSV file which you then read * Directly read the data
By first writing the CSV file to drive
# Write each downloaded payload to its own CSV file on disk.
# Of course you need to replace the actual path to the folder you want below.
# Files are opened in binary mode since Response.content is raw bytes.
for filename, response in (("cases.csv", reccases), ("tests.csv", rectests)):
    with open(os.path.join("/path/to/folder", filename), 'wb') as outfile:
        outfile.write(response.content)
# You can then read those csv files to use in your data analysis.
# The dtypes below were determined by inspecting the raw CSV data beforehand;
# both files are semicolon-separated with a 'jour' (day) date column.
tests = pd.read_csv('tests.csv', sep=';',
                    dtype={'dep': str, 'jour': str, 'clage_covid': str,
                           'nb_test': int, 'nb_pos': int,
                           'nb_test_h': int, 'nb_pos_h': int,
                           'nb_test_f': int, 'nb_pos_f': int},
                    parse_dates=['jour'])
cases = pd.read_csv('cases.csv', sep=';',
                    dtype={'dep': str, 'jour': str,
                           'hosp': int, 'rea': int, 'rad': int, 'dc': int},
                    parse_dates=['jour'])
Note in the code above I had previously looked through the raw csv data to understand how to parse it.
Directly reading the data (bypassing the writing CSV file step)
# Directly parse the downloaded bytes in memory, bypassing the CSV-file step.
cases = pd.read_csv(io.StringIO(requests.get(casescsvurl).content.decode('utf-8')),
                    sep=';',
                    dtype={'dep': str, 'jour': str,
                           'hosp': int, 'rea': int, 'rad': int, 'dc': int},
                    parse_dates=['jour'])
# Bug fix: the testing file uses the testing schema (nb_test/nb_pos/...),
# not the hospitalization columns that were copy-pasted here originally.
tests = pd.read_csv(io.StringIO(requests.get(testscsvurl).content.decode('utf-8')),
                    sep=';',
                    dtype={'dep': str, 'jour': str, 'clage_covid': str,
                           'nb_test': int, 'nb_pos': int,
                           'nb_test_h': int, 'nb_pos_h': int,
                           'nb_test_f': int, 'nb_pos_f': int},
                    parse_dates=['jour'])
Other stuff
Parsing/Converting URI into readable format
It sometimes happens that links are provided as URIs (URLs with special characters percent-encoded into % sequences…)
You generally need to convert those back to correct URLs, example below:
# Example URI (a URL whose special characters are percent-encoded).
testurl = 'https%3A%2F%2Fstatic.data.gouv.fr%2Fresources%2Fdonnees-hospitalieres-relatives-a-lepidemie-de-covid-19%2F20200505-190040%2Fdonnees-hospitalieres-covid19-2020-05-05-19h00.csv'
# Convert it back to a readable URL with urllib.parse.unquote:
urllib.parse.unquote(testurl)
'https://static.data.gouv.fr/resources/donnees-hospitalieres-relatives-a-lepidemie-de-covid-19/20200505-190040/donnees-hospitalieres-covid19-2020-05-05-19h00.csv'
A quick look at French testing data from scratch
Let’s quickly see how, from scratch, we can use code above to scrape testing data and plot it quickly.
Note the data only includes city testing centers and does not include hospital testing.
# Use the main page URL for the city-lab testing dataset.
url_tests = 'https://www.data.gouv.fr/fr/datasets/donnees-relatives-aux-tests-de-depistage-de-covid-19-realises-en-laboratoire-de-ville/'
response_tests = requests.get(url_tests)

# Find the correct CSV file URL (index 1 is the first link with an href).
souptests = BeautifulSoup(response_tests.text, "html.parser")
testscsvurl = souptests.find_all('a', class_="btn btn-sm btn-primary")[1].get('href')

# Read the CSV file into the tests variable.
# Reuse the response already fetched above (the original code downloaded the
# file a second time) and use the testing-data dtypes, not the
# hospitalization ones that were copy-pasted here originally.
rectests = requests.get(testscsvurl)
tests = pd.read_csv(io.StringIO(rectests.content.decode('utf-8')),
                    sep=';',
                    dtype={'dep': str, 'jour': str, 'clage_covid': str,
                           'nb_test': int, 'nb_pos': int,
                           'nb_test_h': int, 'nb_pos_h': int,
                           'nb_test_f': int, 'nb_pos_f': int},
                    parse_dates=['jour'])
#hide
!pip install plotly==4.6.0
Collecting plotly==4.6.0
Downloading https://files.pythonhosted.org/packages/15/90/918bccb0ca60dc6d126d921e2c67126d75949f5da777e6b18c51fb12603d/plotly-4.6.0-py2.py3-none-any.whl (7.1MB)
|████████████████████████████████| 7.2MB 2.4MB/s
Requirement already satisfied: retrying>=1.3.3 in /usr/local/lib/python3.6/dist-packages (from plotly==4.6.0) (1.3.3)
Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from plotly==4.6.0) (1.12.0)
Installing collected packages: plotly
Found existing installation: plotly 4.4.1
Uninstalling plotly-4.4.1:
Successfully uninstalled plotly-4.4.1
Successfully installed plotly-4.6.0
#collapse_hide
import plotly.express as px
import plotly.graph_objects as go

# We want overall testing for France, so we group by day and sum.
# (Filtering for clage_covid == '0' keeps the rows that are not
# differentiated between age groups.)
df = tests[tests.clage_covid == '0'].groupby(['jour']).sum()

# Grouped bar chart: daily positive tests next to daily total tests.
fig = go.Figure(data=[
    go.Bar(name='Positive tests', x=df.index, y=df.nb_pos, marker_color='red'),
    go.Bar(name='Total tests', x=df.index, y=df.nb_test, marker_color='blue')
])
fig.update_layout(title='Daily positive and total testing data in France',
                  xaxis_title='Date',
                  yaxis_title='Number of tests (total and positive)',
                  barmode='group')
fig.show()
Conclusion
Very easy to incorporate this into a python script to automate.
This is only the very basic of scraping, a lot more could be done, maybe in another blog post.