본문 바로가기
Project/Python

7. 부록[기존 의사소통 매체의 문제와 해결방안 탐구 : YouTube 플랫폼을 중심으로]

by sonpang 2021. 10. 29.
반응형
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from datetime import datetime
import pandas as pd
import time
import re

# Filesystem path of the chromedriver executable handed to webdriver.Chrome.
CHROMEDRIVER_PATH = '/Users/손학규/Downloads/chromedriver_win32 (1)/chromedriver'

# Module-level browser session; getCmt() drives this shared instance.
# NOTE(review): passing the driver path positionally is what this script was
# written against — confirm it is still accepted by the installed Selenium.
driver = webdriver.Chrome(CHROMEDRIVER_PATH)
def getCmt(url, output_path='log.xlsx'):
    """Collect the comments of a YouTube video and write them to an Excel file.

    The function drives the module-level ``driver`` browser session: it opens
    ``url``, pages down to make the comment section load, tries to switch the
    comment ordering to "최근 날짜순" (newest first), pages down further to load
    more comments, and then parses the rendered HTML with BeautifulSoup.

    Each extracted row is ``[cleaned_comment_text, link_text]`` where
    ``link_text`` is the text of the first ``<a>`` found directly under a
    ``yt-formatted-string`` inside the comment (the posting-time label).
    Progress is printed to stdout along the way.

    Args:
        url: Address of the YouTube video page to scrape.
        output_path: Destination of the Excel file. Defaults to ``'log.xlsx'``,
            the file name the script has always written.

    Returns:
        None. The result is the Excel file written to ``output_path``.
    """
    # Imported locally so this function is self-contained with respect to the
    # file's import header.
    from selenium.common.exceptions import WebDriverException
    from selenium.webdriver.common.by import By

    sort_menu_xpath = '//*[@id="sort-menu"]'
    newest_first_xpath = (
        '//*[@id="menu"]/a[2]/paper-item/paper-item-body'
        '/div[text()="최근 날짜순"]'
    )

    driver.get(url)
    page_body = driver.find_element(By.TAG_NAME, "body")
    print('시작')

    # Phase 1: scroll until the sort menu is available, then switch the
    # ordering exactly once. Re-clicking after a successful switch would
    # re-apply the sort and make the page reload the comments each time.
    sorted_by_newest = False
    for _ in range(7):
        page_body.send_keys(Keys.PAGE_DOWN)
        time.sleep(2)
        if sorted_by_newest:
            continue
        try:
            driver.find_element(By.XPATH, sort_menu_xpath).click()
            driver.find_element(By.XPATH, newest_first_xpath).click()
        except WebDriverException:
            # The menu is not rendered / not clickable yet; retry after the
            # next page-down. Non-Selenium errors are no longer hidden.
            continue
        sorted_by_newest = True

    # Phase 2: keep scrolling so that more comment threads get loaded.
    for _ in range(20):
        page_body.send_keys(Keys.PAGE_DOWN)
        time.sleep(2)

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    document = soup.find("body")
    if document is None:
        # Fall back to the whole tree when the parser found no <body>.
        document = soup

    title_tag = document.find(
        'yt-formatted-string',
        attrs={'class': 'style-scope ytd-video-primary-info-renderer'},
    )
    # A missing title must not abort the scrape; it is only echoed.
    if title_tag is not None:
        print(title_tag.get_text())

    threads = document.find_all(
        'ytd-comment-renderer',
        attrs={'class': 'style-scope ytd-comment-thread-renderer'},
    )

    cmtlist = []
    for thread in threads:
        # Posting-time label of the comment; empty when the link is absent.
        time_link = thread.select_one('yt-formatted-string > a')
        posted = time_link.get_text() if time_link is not None else ''

        # Comment text
        for content in thread.find_all('yt-formatted-string',
                                       attrs={'id': 'content-text'}):
            # get_text() also covers comments made of several child nodes,
            # for which .string is None and the comment used to be dropped.
            textcmt = re.sub(r'[^\w]', ' ', content.get_text())
            cmtlist.append([textcmt, posted])
            print(textcmt)

        print(posted)
        print('-' * 50)

    print(len(cmtlist))
    # No `encoding` argument: it was removed from DataFrame.to_excel.
    pd.DataFrame(cmtlist).to_excel(output_path)


#for i in (urllist['0'])[7:]:
#      print(i)
#      url.append(i)
#      getCmt(i)

# Replace the placeholder below with the address of the video to scrape.
url = '댓글을 추출할 url 입력'
try:
    getCmt(url)
    print(url)
    print('finished')
finally:
    # Always end the browser session, even when scraping fails, so the
    # Chrome/chromedriver processes do not stay behind after the script exits.
    driver.quit()

[그림 1] 관련 모듈 설치
[그림 2] 댓글 추출 화면: Chromedriver.exe 창과 "자동화된 테스트 소프트웨어에 의해 제어되고 있습니다." 문구가 표시됨

 

반응형

댓글