スクレイピングをしてスプレッドシートに書き込むサンプルテンプレート

ウェブスクレイピングで定期的に取得したちょっとした情報を、曜日と時間ごとの表としてスプレッドシートにまとめるだけのサンプルテンプレートです。例外処理は必要に応じて付け足してください。

import sys
import gspread
import json
import requests
from bs4 import BeautifulSoup
from datetime import datetime, time, date
import re

# Authenticate with a Google service-account key so that gspread can open the
# spreadsheet (see the referenced site for how to set the credentials up).
from oauth2client.service_account import ServiceAccountCredentials

scope = [
    'https://spreadsheets.google.com/feeds',
    'https://www.googleapis.com/auth/drive',
]
credentials = ServiceAccountCredentials.from_json_keyfile_name(
    '認証用のjsonファイルへのパス',  # placeholder: path to the JSON key file
    scope,
)
gc = gspread.authorize(credentials)

# The key is the long opaque string that appears in the workbook's URL.
workbook = gc.open_by_key('ここにスプレッドシートキー')

# Work out which cell of the weekday-by-time table this run should fill.
#   Rows:    row 3 is the 13:00 slot; each following row is 30 minutes later.
#   Columns: column 2 is Sunday, continuing through column 8 for Saturday.

# Read the clock once so the time of day and the date describe the same
# instant; two separate reads could straddle midnight and pair a 23:xx row
# with the next day's column.
current = datetime.now()
now = current.time()
today = current.date()

# Only the 13:00-23:59 window is recorded; outside it there is nothing to do.
# (hour is always 0-23, so checking the lower bound is sufficient.)
if now.hour < 13:
    sys.exit()

row_num = (now.hour - 13) * 2 + 3
if now.minute >= 30:  # the second half of the hour goes on the next row
    row_num += 1

# weekday() is Monday=0 .. Sunday=6; rotate so Sunday comes first, then shift
# so that Sunday lands in column 2.
column_num = (today.weekday() + 1) % 7 + 2

# Scrape the page and extract the numeric value from the target element.
# The timeout keeps a scheduled run from hanging on an unresponsive server,
# because requests applies no timeout unless one is given.
response = requests.get("ここにURL", timeout=10)
# Stop on HTTP errors so that digits from an error page are never recorded.
response.raise_for_status()
# Pass the raw bytes so BeautifulSoup can detect the page encoding itself.
soup = BeautifulSoup(response.content, "html.parser")
x_txt = soup.find(class_="抜き出したいクラス名")
if x_txt is None:
    # find() returns None when nothing matches; exit with a clear message
    # instead of failing later with an AttributeError.
    sys.exit("no element with the target class was found on the page")
x = re.sub(r'[^0-9]+', "", x_txt.text)  # drop everything except ASCII digits

# Write the scraped value into the cell chosen for this weekday/time slot.
worksheet = workbook.worksheet('ワークシート名')
worksheet.update_cell(row=row_num, col=column_num, value=x)

soup.find でうまく取得できない場合は、soup.find_all で取得してみてください。返ってきたリストのどこかに欲しい要素が含まれているはずです。

# Variant for when soup.find() does not return the wanted element: collect
# every element carrying the class and pick the right one by position.
# The timeout keeps a scheduled run from hanging on an unresponsive server.
response = requests.get("ここにURL", timeout=10)
# Stop on HTTP errors so that content from an error page is never recorded.
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")
x_txt = soup.find_all(class_="抜き出したいクラス名")

# To find out which position holds the wanted element, temporarily run:
#     for x in x_txt:
#         print(x.text)
# If it turns out to be, for example, the second element, use index 1.
target_index = 1
if len(x_txt) <= target_index:
    # Exit with a clear message instead of an opaque IndexError.
    sys.exit(
        f"expected at least {target_index + 1} matching elements, "
        f"found {len(x_txt)}"
    )

worksheet = workbook.worksheet('ワークシート名')
worksheet.update_cell(
    row_num,
    column_num,
    x_txt[target_index].text.replace(' ', ''),  # remove spaces
)

コメントを残す コメントをキャンセル

コメントを残すコメントをキャンセル