실전: 시카고 샌드위치 맛집 소개 사이트에 접근¶
In [1]:
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
In [2]:
url_base = 'https://www.chicagomag.com'
url_sub = '/Chicago-Magazine/November-2012/Best-Sandwiches-Chicago/'
# headers={'User-Agent': "Mozilla/5.0"} -> 크롤링 방지로 인한 봇이 아니에요
url = Request(url_base + url_sub, headers={'User-Agent': "Mozilla/5.0"})
html = urlopen(url)
In [43]:
soup = BeautifulSoup(html, 'html.parser')
# soup
In [42]:
# soup.find_all('div', 'sammy')
In [5]:
len(soup.find_all('div', 'sammy'))
Out[5]:
50
In [6]:
soup.find_all('div', 'sammy')[0]
Out[6]:
<div class="sammy" style="position: relative;">
<div class="sammyRank">1</div>
<div class="sammyListing"><a href="/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Old-Oak-Tap-BLT/"><b>BLT</b><br/>
Old Oak Tap<br/>
<em>Read more</em> </a></div>
</div>
접근한 웹 페이지에서 원하는 데이터 추출하고 정리¶
In [7]:
# 맛집 1위 data 가져오기
tmp_one = soup.find_all('div', 'sammy')[0]
type(tmp_one)
Out[7]:
bs4.element.Tag
In [8]:
# 1~50위 순위 class는 sammyRank로 되어 있음
tmp_one.find(class_='sammyRank')
Out[8]:
<div class="sammyRank">1</div>
In [9]:
tmp_one.find(class_='sammyRank').get_text()
Out[9]:
'1'
In [10]:
# sammyRank 안의 클래스 추출
tmp_one.find(class_='sammyListing')
Out[10]:
<div class="sammyListing"><a href="/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Old-Oak-Tap-BLT/"><b>BLT</b><br/>
Old Oak Tap<br/>
<em>Read more</em> </a></div>
In [11]:
# sammyListing 안에 텍스트만 가져오기
tmp_one.find(class_='sammyListing').get_text()
Out[11]:
'BLT\nOld Oak Tap\nRead more '
In [12]:
# 링크 추출
tmp_one.find('a')['href']
Out[12]:
'/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Old-Oak-Tap-BLT/'
데이터 안의 '\n' 제거¶
In [13]:
# 정규식(Regular Express)
import re
tmp_string = tmp_one.find(class_='sammyListing').get_text()
# split : 지정한 특정 패턴이 일치하면 분리
re.split(('\n'), tmp_string)
print(re.split(('\n'), tmp_string)[0]) # 메뉴 이름
print(re.split(('\n'), tmp_string)[1]) # 가게 이름
BLT
Old Oak Tap
절대 경로로 잡힌 url은 그대로 두고 상대 경로로 잡힌 url은 절대 경로로 변경¶
In [14]:
from urllib.parse import urljoin
In [15]:
rank = [] # 순위
main_menu = [] # 메인 메뉴 이름
cafe_name = [] # 카페 이름
url_add = [] # 접근 주소
list_soup = soup.find_all('div', 'sammy')
for item in list_soup:
rank.append(item.find(class_='sammyRank').get_text())
tmp_string = item.find(class_='sammyListing').get_text()
main_menu.append(re.split(('\n'), tmp_string)[0])
cafe_name.append(re.split(('\n'), tmp_string)[1])
url_add.append(urljoin(url_base, item.find('a')['href']))
In [16]:
rank[:5]
Out[16]:
['1', '2', '3', '4', '5']
In [17]:
cafe_name[:5]
Out[17]:
['Old Oak Tap', 'Au Cheval', 'Xoco', 'Al’s Deli', 'Publican Quality Meats']
주피터 노트북에서 상태 진행바를 쉽게 만들어 주는 모듈¶
- pip install tqdm
In [18]:
from tqdm import tqdm_notebook
import time
rank = [] # 순위
main_menu = [] # 메인 메뉴 이름
cafe_name = [] # 카페 이름
url_add = [] # 접근 주소
list_soup = soup.find_all('div', 'sammy')
bar_total = tqdm_notebook(list_soup)
for item in bar_total:
rank.append(item.find(class_='sammyRank').get_text())
tmp_string = item.find(class_='sammyListing').get_text()
main_menu.append(re.split(('\n'), tmp_string)[0])
cafe_name.append(re.split(('\n'), tmp_string)[1])
url_add.append(urljoin(url_base, item.find('a')['href']))
# 크롤링 중 너무 빨리 긁어오는(누락 위험) 것을 방지하기 위해 0.05 만큼 대기
time.sleep(0.05)
C:\Users\Playdata\AppData\Local\Temp\ipykernel_8040\2583802767.py:9: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
bar_total = tqdm_notebook(list_soup)
0%| | 0/50 [00:00<?, ?it/s]
In [19]:
len(rank),len(main_menu),len(cafe_name),len(url_add)
Out[19]:
(50, 50, 50, 50)
In [20]:
import pandas as pd
data={'Rank':rank, 'Menu':main_menu, 'Cafe':cafe_name, 'URL':url_add}
df = pd.DataFrame(data)
df.head()
Out[20]:
Rank | Menu | Cafe | URL | |
---|---|---|---|---|
0 | 1 | BLT | Old Oak Tap | https://www.chicagomag.com/Chicago-Magazine/No... |
1 | 2 | Fried Bologna | Au Cheval | https://www.chicagomag.com/Chicago-Magazine/No... |
2 | 3 | Woodland Mushroom | Xoco | https://www.chicagomag.com/Chicago-Magazine/No... |
3 | 4 | Roast Beef | Al’s Deli | https://www.chicagomag.com/Chicago-Magazine/No... |
4 | 5 | PB&L | Publican Quality Meats | https://www.chicagomag.com/Chicago-Magazine/No... |
In [21]:
df.to_csv('../../data/best_sandwiches_chicago11.csv', sep=',',
encoding='UTF-8')
다수의 웹 페이지에 접근 원하는 정보 가져오기¶
In [22]:
df = pd.read_csv('../../data/best_sandwiches_chicago11.csv', index_col=0)
df.head()
Out[22]:
Rank | Menu | Cafe | URL | |
---|---|---|---|---|
0 | 1 | BLT | Old Oak Tap | https://www.chicagomag.com/Chicago-Magazine/No... |
1 | 2 | Fried Bologna | Au Cheval | https://www.chicagomag.com/Chicago-Magazine/No... |
2 | 3 | Woodland Mushroom | Xoco | https://www.chicagomag.com/Chicago-Magazine/No... |
3 | 4 | Roast Beef | Al’s Deli | https://www.chicagomag.com/Chicago-Magazine/No... |
4 | 5 | PB&L | Publican Quality Meats | https://www.chicagomag.com/Chicago-Magazine/No... |
In [23]:
df['URL'][0]
Out[23]:
'https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Old-Oak-Tap-BLT/'
In [44]:
# url = Request(df['URL'][0], headers={'User-Agent':'Mozilla/5.0'})
# html = urlopen(url)
# soup_tmp = BeautifulSoup(html, "html.parser")
# # soup_tmp
In [25]:
print(soup_tmp.find('p','addy'))
<p class="addy">
<em>$10. 2109 W. Chicago Ave., 773-772-0406, <a href="http://www.theoldoaktap.com/">theoldoaktap.com</a></em></p>
In [26]:
price_tmp = soup_tmp.find('p','addy').get_text()
price_tmp
Out[26]:
'\n$10. 2109 W. Chicago Ave., 773-772-0406, theoldoaktap.com'
In [27]:
price_tmp.split()
Out[27]:
['$10.', '2109', 'W.', 'Chicago', 'Ave.,', '773-772-0406,', 'theoldoaktap.com']
In [28]:
# 10 뒤에 . 제거
price_tmp.split()[0][:-1]
Out[28]:
'$10'
In [29]:
price_tmp.split()[1:-2]
Out[29]:
['2109', 'W.', 'Chicago', 'Ave.,']
In [30]:
' '.join(price_tmp.split()[1:-2])
Out[30]:
'2109 W. Chicago Ave.,'
샌드위치페이지 50개에 접근¶
In [31]:
from tqdm import tqdm_notebook
import time
price = []
address = []
for n in tqdm_notebook(df.index):
url = Request(df['URL'][n], headers={'User-Agent':'Mozilla/5.0'})
html = urlopen(url)
soup_tmp = BeautifulSoup(html, "lxml")
gettings = soup_tmp.find('p','addy').get_text()
price.append(gettings.split()[0][:-1])
address.append(' '.join(gettings.split()[1:-2]))
C:\Users\Playdata\AppData\Local\Temp\ipykernel_8040\390610935.py:7: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
for n in tqdm_notebook(df.index):
0%| | 0/50 [00:00<?, ?it/s]
50개 웹 페이지에 대한 정보 가져오기¶
In [32]:
df['Price'] = price
df['Address'] = address
df = df.loc[:, ['Rank', 'Cafe', 'Menu', 'Price', 'Address']]
df.set_index('Rank', inplace=True)
df.head()
Out[32]:
Cafe | Menu | Price | Address | |
---|---|---|---|---|
Rank | ||||
1 | Old Oak Tap | BLT | $10 | 2109 W. Chicago Ave., |
2 | Au Cheval | Fried Bologna | $9 | 800 W. Randolph St., |
3 | Xoco | Woodland Mushroom | $9.50 | 445 N. Clark St., |
4 | Al’s Deli | Roast Beef | $9.40 | 914 Noyes St., Evanston, |
5 | Publican Quality Meats | PB&L | $10 | 825 W. Fulton Mkt., |
In [33]:
df.to_csv('../../data/best_sandwiches_chicago22.csv', sep=',',
encoding='UTF-8')
맛집 위치를 지도에 표시¶
In [34]:
import folium
import pandas as pd
import googlemaps
import numpy as np
In [35]:
df=pd.read_csv('../../data/03. best_sandwiches_list_chicago2.csv', index_col=0)
df.head()
Out[35]:
Cafe | Menu | Price | Address | |
---|---|---|---|---|
Rank | ||||
1 | Old Oak Tap | BLT | $10 | 2109 W. Chicago Ave., |
2 | Au Cheval | Fried Bologna | $9 | 800 W. Randolph St., |
3 | Xoco | Woodland Mushroom | $9.50 | 445 N. Clark St., |
4 | Al’s Deli | Roast Beef | $9.40 | 914 Noyes St., Evanston, |
5 | Publican Quality Meats | PB&L | $10 | 825 W. Fulton Mkt., |
In [40]:
# gmaps_key = "본인 API"
# gmaps = googlemaps.Client(key=gmaps_key)
In [37]:
lat = []
lng = []
for n in tqdm_notebook(df.index):
if df['Address'][n] != 'Multiple':
target_name = df['Address'][n]+', '+'Chicago'
gmaps_output = gmaps.geocode(target_name)
location_output = gmaps_output[0].get('geometry')
lat.append(location_output['location']['lat'])
lng.append(location_output['location']['lng'])
else:
lat.append(np.nan)
lng.append(np.nan)
df['lat'] = lat
df['lng'] = lng
df.head()
C:\Users\Playdata\AppData\Local\Temp\ipykernel_8040\1548284059.py:3: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
for n in tqdm_notebook(df.index):
0%| | 0/50 [00:00<?, ?it/s]
Out[37]:
Cafe | Menu | Price | Address | lat | lng | |
---|---|---|---|---|---|---|
Rank | ||||||
1 | Old Oak Tap | BLT | $10 | 2109 W. Chicago Ave., | 41.895558 | -87.679967 |
2 | Au Cheval | Fried Bologna | $9 | 800 W. Randolph St., | 41.884639 | -87.647590 |
3 | Xoco | Woodland Mushroom | $9.50 | 445 N. Clark St., | 41.890523 | -87.630783 |
4 | Al’s Deli | Roast Beef | $9.40 | 914 Noyes St., Evanston, | 42.058322 | -87.683748 |
5 | Publican Quality Meats | PB&L | $10 | 825 W. Fulton Mkt., | 41.886604 | -87.648536 |
In [38]:
mapping = folium.Map(location=[df['lat'].mean(), df['lng'].mean()],
zoom_start=11)
for n in df.index:
if df['Address'][n] != 'Multiple':
folium.Marker([df['lat'][n], df['lng'][n]],
popup=df['Cafe'][n]).add_to(mapping)
mapping
Out[38]:
Make this Notebook Trusted to load map: File -> Trust Notebook
In [39]:
# 샌드위치 가게에 MarkerCluster를 추가
from folium.plugins import MarkerCluster
mc = MarkerCluster().add_to(mapping)
mapping = folium.Map(location=[df['lat'].mean(), df['lng'].mean()],
zoom_start=11)
for n in df.index:
if df['Address'][n] != 'Multiple':
folium.Marker([df['lat'][n], df['lng'][n]],
popup=df['Cafe'][n]).add_to(mapping)
mc.add_child(folium.Marker([df['lat'][n],
df['lng'][n]],
popup=df['Cafe'][n])).add_to(mapping)
mapping
Out[39]:
Make this Notebook Trusted to load map: File -> Trust Notebook
반응형
'데이터분석' 카테고리의 다른 글
[23.07.03] 데이터 시각화(기름 제일 싼 곳) - 22(1) (0) | 2023.07.03 |
---|---|
[23.06.30] 데이터 시각화(따릉이) - 21(3) (0) | 2023.06.30 |
[23.06.30] 웹 크롤링 - 21(1) (0) | 2023.06.30 |
[23.06.29] 데이터 시각화 - 20(1) (0) | 2023.06.29 |
[23.06.28] 데이터 시각화(CCTV) - 19(2) (0) | 2023.06.28 |