Topic 5: BeautifulSoup으로 HTML 파싱하기 🍲

🎯 학습 목표

가져온 HTML에서 원하는 정보만 쏙쏙 뽑아내는 방법을 배워요!

BeautifulSoup이 무엇인지 이해하기
HTML을 파싱하는 방법 익히기
태그 찾기와 선택하기
속성과 텍스트 추출하기
실전 크롤링 해보기

🍜 BeautifulSoup이란?

BeautifulSoup은 HTML을 쉽게 분석할 수 있게 해주는 라이브러리예요.

마치 국물에서 건더기를 건져내는 것처럼, HTML에서 원하는 정보를 뽑아낼 수 있어요!

왜 BeautifulSoup이 필요한가요?


# A raw HTML string gives no easy way to reach the text inside the <p> tag.
html = "<div><p>안녕하세요</p></div>"

# BeautifulSoup turns the string into a searchable tree, so the paragraph
# can be located by tag name and its text read directly.
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')
paragraph = soup.find('p')
text = paragraph.text  # "안녕하세요"

🚀 BeautifulSoup 시작하기

기본 사용법


from bs4 import BeautifulSoup

# Sample document used for the first parsing experiment.
html = """
<html>
<body>
    <h1>웹 크롤링 배우기</h1>
    <p>파이썬으로 웹 크롤링을 해봐요!</p>
    <ul>
        <li>HTML 이해하기</li>
        <li>requests 사용하기</li>
        <li>BeautifulSoup 활용하기</li>
    </ul>
</body>
</html>
"""

# Build the parse tree with the parser named 'html.parser'.
soup = BeautifulSoup(html, 'html.parser')

# prettify() gives back the document as a neatly indented string.
pretty_html = soup.prettify()
print(pretty_html)

파서(Parser) 선택하기


# 1. html.parser (the default choice - no separate installation needed)
soup = BeautifulSoup(html, 'html.parser')

# 2. lxml (fast and powerful - must be installed separately)
# pip install lxml
soup = BeautifulSoup(html, 'lxml')

# html.parser is enough for most cases!

🔍 태그 찾기 - find()와 find_all()

find() - 첫 번째 태그 찾기


from bs4 import BeautifulSoup

# Three sibling paragraphs; find() will only return the first of them.
html = """
<div>
    <p>첫 번째 문단입니다.</p>
    <p>두 번째 문단입니다.</p>
    <p>세 번째 문단입니다.</p>
</div>
"""

soup = BeautifulSoup(html, 'html.parser')

# find() returns the first tag that matches the given name.
first_p = soup.find('p')
first_p_text = first_p.text
print(first_p)       # <p>첫 번째 문단입니다.</p>
print(first_p_text)  # 첫 번째 문단입니다.

find_all() - 모든 태그 찾기


# find_all() collects every matching tag instead of stopping at the first.
all_p = soup.find_all('p')
p_count = len(all_p)
print(f"p 태그 개수: {p_count}")

# Print the paragraphs one by one, numbering them from 1.
for position, paragraph in enumerate(all_p, start=1):
    print(f"{position}번째: {paragraph.text}")

태그 이름으로 직접 접근


# The find()/find_all() sample just above contains only <p> tags, so soup.h1
# and soup.li would be None there. Build a document that really has the tags
# accessed below so this snippet runs on its own.
html = """
<h1>웹 크롤링 배우기</h1>
<ul>
    <li>HTML 이해하기</li>
    <li>requests 사용하기</li>
    <li>BeautifulSoup 활용하기</li>
</ul>
"""
soup = BeautifulSoup(html, 'html.parser')

# Attribute access has the same effect as find('tag').
title = soup.h1  # same as soup.find('h1')
print(title.text)  # 웹 크롤링 배우기

# Only the first matching tag is returned.
first_li = soup.li  # the first <li> only
print(first_li.text)  # HTML 이해하기

🏷️ 속성으로 태그 찾기

class로 찾기


# Two paragraphs share the "important" class, one uses "normal".
html = """
<div>
    <p class="important">중요한 내용입니다.</p>
    <p class="normal">일반 내용입니다.</p>
    <p class="important">이것도 중요해요!</p>
</div>
"""

soup = BeautifulSoup(html, 'html.parser')

# Filter by CSS class with the class_ keyword (note the trailing underscore).
important = soup.find('p', class_='important')
important_text = important.text
print(important_text)  # 중요한 내용입니다.

# find_all() with the same filter returns every "important" paragraph.
all_important = soup.find_all('p', class_='important')
for paragraph in all_important:
    label = f"중요: {paragraph.text}"
    print(label)

id로 찾기


# Every tag in this sample carries its own id attribute.
html = """
<div>
    <h1 id="main-title">메인 제목</h1>
    <p id="intro">소개 문단입니다.</p>
    <p id="content">본문 내용입니다.</p>
</div>
"""

soup = BeautifulSoup(html, 'html.parser')

# Look up the heading by tag name plus id.
title = soup.find('h1', id='main-title')
title_text = title.text
print(title_text)  # 메인 제목

# The same keyword works for any tag, here a paragraph.
intro = soup.find('p', id='intro')
intro_text = intro.text
print(intro_text)  # 소개 문단입니다.

딕셔너리로 속성 지정


# Two links that differ in their target attribute.
html = """
<a href="https://python.org" target="_blank">파이썬</a>
<a href="https://google.com" target="_self">구글</a>
"""

soup = BeautifulSoup(html, 'html.parser')

# Attribute filters can be supplied as a dictionary of name -> value.
target_filter = {'target': '_blank'}
link = soup.find('a', target_filter)
link_text = link.text
link_href = link['href']
print(link_text)  # 파이썬
print(link_href)  # https://python.org

🎯 CSS 선택자 사용하기

CSS 선택자를 알고 있다면 더 쉽게 찾을 수 있어요!

select_one()과 select()


# An article with a title, a content block and a list of tags.
html = """
<article>
    <h2 class="title">뉴스 제목</h2>
    <div class="content">
        <p>첫 번째 문단</p>
        <p class="highlight">중요한 문단</p>
        <p>마지막 문단</p>
    </div>
    <ul class="tags">
        <li>파이썬</li>
        <li>크롤링</li>
        <li>데이터</li>
    </ul>
</article>
"""

soup = BeautifulSoup(html, 'html.parser')

# select_one() returns the first element matching a CSS selector.
title = soup.select_one('.title')  # class="title"
title_text = title.text
print(f"제목: {title_text}")

# select() returns every match: all <li> inside class="tags".
tags = soup.select('.tags li')
for tag_item in tags:
    tag_text = tag_item.text
    print(f"태그: {tag_text}")

# A descendant selector narrows the search to elements inside .content.
highlight = soup.select_one('.content .highlight')
highlight_text = highlight.text
print(f"중요: {highlight_text}")

CSS 선택자 활용 예시


# A variety of CSS selectors
soup.select('p')              # every <p> tag
soup.select('.class-name')    # class="class-name"
soup.select('#id-name')       # id="id-name"
soup.select('div p')          # every <p> inside a <div>
soup.select('div > p')        # <p> tags that are direct children of a <div>
soup.select('[href]')         # every tag that has an href attribute
soup.select('[href="http://example.com"]')  # a specific href value

📤 정보 추출하기

텍스트 추출


# Sample with padded text and a paragraph that contains a nested tag.
html = """
<div>
    <h1>  제목입니다  </h1>
    <p>문단 <strong>강조</strong> 텍스트</p>
    <span>   공백이 많아요   </span>
</div>
"""

soup = BeautifulSoup(html, 'html.parser')

# .text - extracts all of the text, surrounding whitespace included
h1 = soup.find('h1')
print(f"text: '{h1.text}'")  # '  제목입니다  '
print(f"strip: '{h1.text.strip()}'")  # '제목입니다'

# .string - only works when the tag directly holds a single text node
p = soup.find('p')
print(f"text: {p.text}")  # 문단 강조 텍스트
print(f"string: {p.string}")  # None (because there is a child tag)

# .get_text() - lets you choose a separator
print(p.get_text())  # 문단 강조 텍스트
# strip=True trims every text fragment before joining. Without it the
# fragments keep their own spaces and the result is "문단  | 강조 |  텍스트".
print(p.get_text(separator=' | ', strip=True))  # 문단 | 강조 | 텍스트

속성 추출


# A link, an image and a box that carries two classes and a data attribute.
html = """
<div>
    <a href="https://python.org" title="파이썬 공식 사이트">Python</a>
    <img src="logo.jpg" alt="로고" width="100" height="50">
    <div class="box blue" data-id="123">박스</div>
</div>
"""

soup = BeautifulSoup(html, 'html.parser')

# Attributes can be read with dictionary-style indexing.
link = soup.find('a')
link_href = link['href']
link_title = link['title']
print(link_href)   # https://python.org
print(link_title)  # 파이썬 공식 사이트

# get() returns None for a missing attribute.
href_via_get = link.get('href')
class_via_get = link.get('class')
print(href_via_get)   # https://python.org
print(class_via_get)  # None

# .attrs exposes every attribute of the tag as a dictionary.
img = soup.find('img')
img_attributes = img.attrs
print(img_attributes)  # {'src': 'logo.jpg', 'alt': '로고', ...}

# A tag with several classes returns them as a list.
box = soup.find('div', class_='box')
box_classes = box['class']
box_data_id = box['data-id']
print(box_classes)  # ['box', 'blue'] - returned as a list
print(box_data_id)  # 123

🌲 HTML 트리 탐색하기

부모, 자식, 형제 찾기


# Three sibling paragraphs inside one parent <div>.
html = """
<div class="family">
    <p>첫째</p>
    <p>둘째</p>
    <p>셋째</p>
</div>
"""

soup = BeautifulSoup(html, 'html.parser')
second = soup.find_all('p')[1]  # pick the second paragraph

# Parent lookup
parent = second.parent
print(f"부모: {parent.name}")  # div
print(f"부모 클래스: {parent['class']}")  # ['family']

# Nearest sibling tags on either side
next_sibling = second.find_next_sibling()
print(f"다음 형제: {next_sibling.text}")  # 셋째

prev_sibling = second.find_previous_sibling()
print(f"이전 형제: {prev_sibling.text}")  # 첫째

# Every sibling tag that comes after this one. The method is
# find_next_siblings(); there is no find_all_next_siblings(), and calling
# that name fails because unknown Tag attributes resolve to None.
siblings = second.find_next_siblings()
for s in siblings:
    print(f"뒤의 형제: {s.text}")

자식 요소 접근


# A list whose <li> children are separated by whitespace text nodes.
html = """
<ul>
    <li>항목 1</li>
    <li>항목 2</li>
    <li>항목 3</li>
</ul>
"""

soup = BeautifulSoup(html, 'html.parser')
ul = soup.find('ul')

# children - direct children only (an iterator)
for child in ul.children:
    if child.name:  # tags only; text nodes have name None
        print(child.text)

# descendants - every nested node, at any depth
for descendant in ul.descendants:
    # Text nodes also expose a `name` attribute (it is None), so
    # hasattr(descendant, 'name') is always true and filters nothing.
    # Test the value instead, exactly like the children loop above.
    if descendant.name:
        print(descendant)

💪 실전 예제: 뉴스 사이트 크롤링

뉴스 기사 추출하기


from bs4 import BeautifulSoup

# Mock markup standing in for a news listing page.
news_html = """
<div class="news-container">
    <article class="news-item">
        <h2 class="title">파이썬 4.0 출시 예정</h2>
        <p class="summary">혁신적인 기능들이 추가될 예정입니다.</p>
        <div class="meta">
            <span class="date">2024-01-15</span>
            <span class="author">김기자</span>
            <span class="category">IT</span>
        </div>
    </article>
    
    <article class="news-item">
        <h2 class="title">AI 개발 동향</h2>
        <p class="summary">인공지능 기술이 빠르게 발전하고 있습니다.</p>
        <div class="meta">
            <span class="date">2024-01-14</span>
            <span class="author">이기자</span>
            <span class="category">AI</span>
        </div>
    </article>
</div>
"""

soup = BeautifulSoup(news_html, 'html.parser')

# (dict key, tag name) pairs; each key is also the CSS class of its tag.
FIELD_TAGS = (
    ('title', 'h2'),
    ('summary', 'p'),
    ('date', 'span'),
    ('author', 'span'),
    ('category', 'span'),
)

# Collected articles, one dict per <article>.
news_list = []

articles = soup.find_all('article', class_='news-item')
print(f"📰 총 {len(articles)}개의 기사를 찾았습니다.\n")

divider = "-" * 40
for article in articles:
    # Search inside this article only, so fields never mix between articles.
    news = {
        field: article.find(tag_name, class_=field).text
        for field, tag_name in FIELD_TAGS
    }
    news_list.append(news)

    # Report the article that was just collected.
    print(f"제목: {news['title']}")
    print(f"요약: {news['summary']}")
    print(f"날짜: {news['date']} | 기자: {news['author']} | 분류: {news['category']}")
    print(divider)

쇼핑몰 상품 정보 추출


# Sample shop markup: the first product has an original and a sale price,
# the second one only has a sale price.
shop_html = """
<div class="products">
    <div class="product-card" data-id="001">
        <img src="book1.jpg" alt="파이썬 입문">
        <h3 class="name">파이썬 입문서</h3>
        <p class="price">
            <span class="original">30,000원</span>
            <span class="sale">25,000원</span>
        </p>
        <div class="rating">
            <span class="stars">★★★★☆</span>
            <span class="count">(42)</span>
        </div>
    </div>
    
    <div class="product-card" data-id="002">
        <img src="book2.jpg" alt="크롤링 마스터">
        <h3 class="name">웹 크롤링 완벽 가이드</h3>
        <p class="price">
            <span class="sale">35,000원</span>
        </p>
        <div class="rating">
            <span class="stars">★★★★★</span>
            <span class="count">(128)</span>
        </div>
    </div>
</div>
"""

soup = BeautifulSoup(shop_html, 'html.parser')


def _parse_won(price_text):
    """Return a price label such as "30,000원" as the integer 30000."""
    return int(price_text.replace(',', '').replace('원', ''))


def _discount_percent(original_num, sale_num):
    """Return the discount as a whole percent, rounded down.

    Integer arithmetic is used on purpose. The float form
    int((1 - sale / original) * 100) reports 9 for an exact 10% discount
    (10,000 -> 9,000) because 1 - 0.9 is slightly below 0.1 in binary
    floating point. A non-positive original price yields 0 instead of
    raising ZeroDivisionError.
    """
    if original_num <= 0:
        return 0
    return (original_num - sale_num) * 100 // original_num


# Collected products, one dict per product card.
products = []

for card in soup.find_all('div', class_='product-card'):
    product = {
        'id': card['data-id'],
        'name': card.find('h3', class_='name').text,
        'image': card.find('img')['src'],
        'rating_stars': card.find('span', class_='stars').text,
        # The review count is rendered as "(42)"; drop the parentheses.
        'rating_count': card.find('span', class_='count').text.strip('()'),
    }

    # Price handling: a card has an "original" span only when it is discounted.
    price_elem = card.find('p', class_='price')
    original = price_elem.find('span', class_='original')
    sale = price_elem.find('span', class_='sale')

    if original is not None:
        product['original_price'] = original.text
        product['sale_price'] = sale.text
        discount = _discount_percent(
            _parse_won(original.text), _parse_won(sale.text)
        )
        product['discount'] = f"{discount}%"
    else:
        product['sale_price'] = sale.text
        product['discount'] = "0%"

    products.append(product)

# Print the collected products.
print("🛍️ 상품 목록\n")
for p in products:
    print(f"[{p['id']}] {p['name']}")
    # end='' keeps the cursor on the price line so the discount note,
    # when there is one, is appended to the same line.
    print(f"  가격: {p['sale_price']}", end='')
    if 'original_price' in p:
        print(f" (원가: {p['original_price']}, {p['discount']} 할인)")
    else:
        print()
    print(f"  평점: {p['rating_stars']} ({p['rating_count']}명)")
    print()

🎓 실전 크롤링 팁

1. 안전한 태그 찾기


# find() returns None when nothing matches, so check before using the result.
title = soup.find('h1')
print(title.text if title else "제목을 찾을 수 없어요")

# Alternative: attempt the access and fall back when the tag is missing
# (reading .text on None raises AttributeError).
try:
    heading_text = soup.find('h1').text
except AttributeError:
    title = "제목 없음"
else:
    title = heading_text

2. 여러 클래스 처리


# One element that carries three classes at once.
html = '<div class="box blue large">내용</div>'
soup = BeautifulSoup(html, 'html.parser')

# Option 1: match on any single one of its classes.
box = soup.find('div', class_='box')

# Option 2: chain the classes in a CSS selector to require all of them.
box = soup.select_one('.box.blue.large')

# Option 3: inspect the class list yourself.
div = soup.find('div')
div_classes = div['class']
required_classes = ('box', 'blue')
if all(name in div_classes for name in required_classes):
    print("box와 blue 클래스를 모두 가지고 있어요")

3. 정규표현식 활용


import re

# Two ids start with "para", one starts with "note".
html = """
<div>
    <p id="para1">문단 1</p>
    <p id="para2">문단 2</p>
    <p id="note1">노트 1</p>
</div>
"""

soup = BeautifulSoup(html, 'html.parser')

# A compiled pattern can be used as an attribute filter: here it keeps
# every <p> whose id starts with "para".
para_id_pattern = re.compile(r'^para')
paras = soup.find_all('p', id=para_id_pattern)
for paragraph in paras:
    print(paragraph.text)

💡 퀴즈: BeautifulSoup 이해도 체크

Q1. class=“title”인 첫 번째 요소를 찾는 코드는?


soup.find('???', class_='???')

💡 정답 확인


soup.find('태그명', class_='title')
# Or, regardless of the tag name
soup.find(class_='title')
# Using a CSS selector
soup.select_one('.title')

Q2. 모든 링크의 href를 출력하려면?

💡 정답 확인


# Gather every <a> tag, then print the href of those that have one.
links = soup.find_all('a')
for anchor in links:
    href = anchor.get('href')  # None when the attribute is missing
    if href:
        print(href)

✅ BeautifulSoup 마스터 체크리스트

BeautifulSoup 객체를 만들 수 있나요?find()와 find_all()을 사용할 수 있나요?class, id 등 속성으로 찾을 수 있나요?CSS 선택자를 사용할 수 있나요?텍스트를 추출할 수 있나요?속성값을 추출할 수 있나요?부모, 자식, 형제 태그를 찾을 수 있나요?안전하게 정보를 추출할 수 있나요?

🚀 마무리

축하합니다! 이제 여러분은 웹 크롤링의 핵심 기술을 모두 배웠어요! 🎉

배운 내용 정리:

HTML/CSS 기초: 웹페이지의 구조 이해
개발 환경 설정: venv와 라이브러리 설치
requests: 웹페이지 가져오기
BeautifulSoup: HTML에서 정보 추출하기

이제 할 수 있는 것들:

웹사이트에서 데이터 수집
뉴스 기사 스크래핑
상품 정보 수집
자동화된 웹 데이터 수집

⚠️ 항상 기억하세요:

robots.txt 확인하기
서버에 부담 주지 않기 (time.sleep() 사용)
저작권과 이용약관 준수하기
개인정보 보호하기

다음 유닛에서는 수집한 데이터를 웹 앱으로 만들어 보여주는 방법을 배워볼게요! 🌐