[데브코스] TIL 10일차

dev course - DE/TIL

[데브코스] TIL 10일차

nani-jin 2024. 4. 5. 13:27

웹 스크래핑 기초 - 시각화 라이브러리, Seaborn

데이터를 추출하면 끝인가?
→ NO. 유의미하게 가공해야 인사이트를 얻기 편할 것
→ 한 눈에 보여주려면?
→ 시각화(visualization)가 해답
시각화로 결과 떠먹여주기
- seaborn : 파이썬 데이터 시각화 라이브러리로, 다양한 그래프를 그릴 수 있음
- matplotlib : seaborn의 기반이 되는 시각화 라이브러리로, matplotlib.pyplot의 함수로 그래프의 속성(제목, 축 라벨, 축 범위, 크기 등)을 변경/추가할 수 있음

# Load the plotting libraries: seaborn draws the charts, while
# matplotlib.pyplot is required for the plt.show() calls below
# (without this import the snippet fails with a NameError).
import matplotlib.pyplot as plt
import seaborn as sns

# Line plot
sns.lineplot(x=[1, 3, 2, 4], y=[0.7, 0.2, 0.1, 0.05])
plt.show()

# Bar plot
sns.barplot(x=[1, 3, 2, 4], y=[0.7, 0.2, 0.1, 0.05])
plt.show()

## Changing / adding plot properties
# matplotlib.pyplot exposes the functions that tweak the current figure
import matplotlib.pyplot as plt

# Add a title to the chart
sns.barplot(
    x=[1, 2, 3, 4],
    y=[0.7, 0.2, 0.1, 0.05],
)
plt.title("Bar plot")
plt.show()

# Label both axes
sns.barplot(
    x=[1, 2, 3, 4],
    y=[0.7, 0.2, 0.1, 0.05],
)
plt.ylabel("Y label")
plt.xlabel("X label")
plt.show()

# Restrict the visible range of each axis
sns.lineplot(
    x=[1, 3, 2, 4],
    y=[4, 3, 2, 1],
)
plt.xlim(0, 4)
plt.ylim(0, 4)
plt.show()

# Set the figure size; the figure has to exist before the plot is drawn on it
plt.figure(figsize=(20, 10))
sns.lineplot(
    x=[1, 2, 3, 4],
    y=[4, 3, 2, 1],
)
plt.show()

*그 외 그래프 참고 - https://seaborn.pydata.org/examples/index.html

Example gallery — seaborn 0.13.2 documentation

seaborn.pydata.org

실습 - 기상청 날씨 스크래핑

## Jupyter lab에서 진행함
# 스크래핑에 필요한 라이브러리 불러오기
from selenium import webdriver
from selenium.webdriver import ActionChains
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.actions.action_builder import ActionBuilder
from selenium.webdriver.common.keys import Keys
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By


# Fetch the short-term forecast page of the Korea Meteorological
# Administration with a Selenium-driven Chrome and read the temperature chart.
# Selenium 4 expects the chromedriver location wrapped in a Service object;
# passing the path positionally is the Selenium 3 convention and no longer works.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
try:
    driver.get("https://www.weather.go.kr/w/weather/forecast/short-term.do")
    driver.implicitly_wait(5)

    chart_text = driver.find_element(By.ID, "my-tchart").text
finally:
    # Always close the browser so no Chrome/chromedriver process is left behind.
    driver.quit()

# One reading per line, e.g. "12℃"; strip the unit and skip blank lines so
# int() never sees an empty string.
temps = [
    int(line)
    for line in chart_text.replace("℃", "").split("\n")
    if line.strip()
]

# Show the upcoming temperature trend as a line plot; the x axis is simply
# the position of each reading in `temps`.
import seaborn as sns

sns.lineplot(x=list(range(len(temps))), y=temps)

# Draw the scraped temperatures as a line chart with a title and a y axis
# padded by 2 degrees below the minimum and above the maximum.
import matplotlib.pyplot as plt

plt.title("Expected Temperature from now on")
plt.ylim(min(temps) - 2, max(temps) + 2)

sns.lineplot(x=list(range(len(temps))), y=temps)

실습 - 해시코드 질문태그 빈도 시각화

## Visualize how often each question tag appears, using bs4 and seaborn
# Browser-like User-Agent header sent with every request
user_agent = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"}


# Import the scraping libraries and prepare the tag -> count dictionary
import time
frequency = dict()

import requests
from bs4 import BeautifulSoup

for i in range(1, 11):
    # The second positional parameter of requests.get() is `params` (query
    # string), so the header dict must be passed by keyword as `headers`.
    res = requests.get("https://hashcode.co.kr/?page={}".format(i), headers=user_agent)
    soup = BeautifulSoup(res.text, "html.parser")

    # 1. Find every <ul class="question-tags">
    # 2. Count the text of each <li> inside it
    ul_tags = soup.find_all("ul", "question-tags")
    for ul in ul_tags:
        for li in ul.find_all("li"):
            tag = li.text.strip()
            frequency[tag] = frequency.get(tag, 0) + 1
    # Pause between pages to avoid hammering the server
    time.sleep(0.5)
print(frequency)


# Wrap the tag counts in a Counter and print the ten most frequent tags
from collections import Counter

counter = Counter(frequency)
most_frequent = counter.most_common(10)
print(most_frequent)


# Draw the ten most frequent tags as a seaborn bar plot, with the figure
# size, axis labels and title set through matplotlib.pyplot
import seaborn as sns
import matplotlib.pyplot as plt

top_tags = counter.most_common(10)
x = [tag for tag, _ in top_tags]
y = [count for _, count in top_tags]

plt.figure(figsize=(10, 5))
plt.ylabel("Frequency")
plt.xlabel("Tag")
plt.title("Frequency of question in Hashcode")

sns.barplot(x=x, y=y)
plt.show()

Wordcloud
자주 등장하는 텍스트를 중요도나 인기도를 고려해 표현한 것

WordCloud를 만드는 방법
1) KoNLPy 라이브러리로 한국어 문장 전처리
2) Counter를 이용해 빈도수 측정
3) WordCloud를 이용해 시각화

# 시각화에 쓰이는 라이브러리
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# 횟수를 기반으로 딕셔너리 생성
from collections import Counter

# 문장에서 명사를 추출하는 형태소 분석 라이브러리
from konlpy.tag import Hannanum

# Sample Korean text (national anthem lyrics) that the noun extraction and
# word cloud below are run on.
national_anthem = """
동해물과 백두산이 마르고 닳도록
하느님이 보우하사 우리나라 만세
무궁화 삼천리 화려 강산
대한 사람 대한으로 길이 보전하세
남산 위에 저 소나무 철갑을 두른 듯
바람 서리 불변함은 우리 기상일세
무궁화 삼천리 화려 강산
대한 사람 대한으로 길이 보전하세
가을 하늘 공활한데 높고 구름 없이
밝은 달은 우리 가슴 일편단심일세
무궁화 삼천리 화려 강산
대한 사람 대한으로 길이 보전하세
이 기상과 이 맘으로 충성을 다하여
괴로우나 즐거우나 나라 사랑하세
무궁화 삼천리 화려 강산
대한 사람 대한으로 길이 보전하세
"""

# Build a Hannanum analyzer and extract the nouns from the lyrics
hannanum = Hannanum()
nouns = hannanum.nouns(national_anthem)
# Keep only nouns that are at least two characters long
words = [token for token in nouns if len(token) >= 2]

# Preview of the first ten words (only rendered when it ends a notebook cell)
words[:10]

# Count how many times each remaining word occurs
counter = Counter(words)
print(counter)

# Render the word frequencies as a word cloud image.
# NOTE(review): font_path is an absolute path on the author's machine;
# presumably it must point to a font with Korean glyphs - adjust locally.
wordcloud = WordCloud(
    width=1000,
    height=1000,
    background_color="white",
    font_path="/Users/jinjeoh/Library/Fonts/MaruBuri-Regular.ttf",
)
img = wordcloud.generate_from_frequencies(counter)
plt.imshow(img)

실습 - 워드클라우드 만들기(해시코드 질문 키워드)

## Browser-like User-Agent header sent with every request
user_agent = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"}


## Collect the titles of the paginated question list into `questions`
# https://hashcode.co.kr/?page={i}
# Wait 0.5 seconds between requests to avoid overloading the server
import time
import requests
from bs4 import BeautifulSoup

questions = []

for i in range(1, 6):
    # `user_agent` is already the complete headers dict, so pass it as-is via
    # the `headers` keyword. Passing it positionally would send it as query
    # parameters, and wrapping it in another {"User-Agent": ...} dict would
    # make the header value a dict instead of a string.
    res = requests.get("https://hashcode.co.kr/?page={}".format(i), headers=user_agent)
    soup = BeautifulSoup(res.text, "html.parser")

    parsed_datas = soup.find_all("li", "question-list-item")

    for data in parsed_datas:
        # Skip list items without an <h4> title instead of raising AttributeError
        if data.h4 is not None:
            questions.append(data.h4.text.strip())
    time.sleep(0.5)
    
    
    
 ## 텍스트 구름을 그리기 위해 필요한 라이브러리 불러오기
 # 시각화에 쓰이는 라이브러리
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# 횟수를 기반으로 딕셔너리 생성
from collections import Counter

# 문장에서 명사를 추출하는 형태소 분석 라이브러리
from konlpy.tag import Hannanum

# Build a Hannanum analyzer and gather the nouns of every question title
hannanum = Hannanum()
words = []

for ques in questions:
    # Nouns found in this single title
    nouns = hannanum.nouns(ques)
    # Append them to the running list across all titles
    words.extend(nouns)


## Count how many times each noun occurs
counter = Counter(words)


## Render the noun frequencies as a word cloud image.
# NOTE(review): font_path is an absolute path on the author's machine;
# presumably it must point to a font with Korean glyphs - adjust locally.
import matplotlib.pyplot as plt

wordcloud = WordCloud(
    width=1000,
    height=1000,
    background_color="white",
    font_path="/Users/jinjeoh/Library/Fonts/MaruBuri-Regular.ttf",
)

img = wordcloud.generate_from_frequencies(counter)
plt.imshow(img)