Any tricks to fix encoding problems without a metric-ton of .replace()? Python3 Chrome-Driver BS4

print() displays the scraped page text perfectly in the IDLE shell. However, write()/writelines()/print(..., file=...) will not write it to a file without throwing a pile of encode errors or producing garbled characters.



I tried various forms of .encode(encoding='...', errors='...') to no avail. With every encoding I tried, the output either came out garbled or turned into runs of ? in the text file.
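From what I can tell, the likely root cause is that on Windows open() defaults to the legacy locale codec (usually cp1252), which cannot encode Japanese punctuation, hence the encode errors and question marks. A minimal sketch of what I understand the usual fix to be (the file name and sample string here are placeholders):

# Sample string containing the kind of CJK punctuation the .replace()
# chain below tries to strip out.
text = 'Chapter 1 「Isekai Maou」…○×'

# Forcing UTF-8 sidesteps the Windows default codec (cp1252), which
# cannot represent these characters; whatever print() shows in the
# shell gets written to the file verbatim.
with open('chapter.txt', 'a+', encoding='utf-8') as file:
    print(text, file=file)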



If I were willing to spend ten years chaining .replace('...', '...') calls, as shown where text = ... is defined in the code below, I could get this working completely.
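If some character substitution is still wanted on top of a correct file encoding, a translation table would at least collapse the chain into a single pass. A sketch, with a mapping I would extend as new punctuation shows up (the table contents here just mirror part of the .replace() chain below):

# One-pass character mapping with str.translate() instead of chained
# .replace() calls; keys are single characters, values may be any string.
PUNCTUATION_MAP = str.maketrans({
    '『': '[', '』': ']',
    '「': '"', '」': '"',
    '“': '"', '”': '"',
    '…': '...', 'ー': '-',
    '○': '0', '×': 'x',
})

text = '『Isekai Maou』…「Diablo」'.translate(PUNCTUATION_MAP)
print(text)  # [Isekai Maou]..."Diablo"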



#! python3
import os
import os.path
from os import path
import requests
import bs4 as BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.chrome.options import Options


def Close():
    driver.stop_client()
    driver.close()
    driver.quit()

CHROMEDRIVER_PATH = 'E:\\Downloads\\chromedriver_win32\\chromedriver.exe'

# start raw html
NovelName = 'Novel/Isekai-Maou-to-Shoukan-Shoujo-Dorei-Majutsu'
BaseURL = 'https://novelplanet.com'
url = '%(U)s/%(N)s' % {'U': BaseURL, 'N': NovelName}

options = Options()
options.add_experimental_option("excludeSwitches", ["ignore-certificate-errors"])
#options.add_argument("--headless")  # Runs Chrome in headless mode.
#options.add_argument('--no-sandbox')  # Bypass OS security model
#options.add_argument('--disable-gpu')  # applicable to Windows OS only
options.add_argument('start-maximized')
options.add_argument('disable-infobars')
#options.add_argument("--disable-extensions")
driver = webdriver.Chrome(CHROMEDRIVER_PATH, options=options)
driver.get(url)

# wait for title not to be equal to "Please wait 5 seconds..."
wait = WebDriverWait(driver, 10)
wait.until(lambda driver: driver.title != "Please wait 5 seconds...")

soup = BeautifulSoup.BeautifulSoup(driver.page_source, 'html.parser')
# End raw html

# Start get first chapter link
i = 0
for chapterLink in soup.find_all(class_='rowChapter'):
    i += 1
    cLink = chapterLink.find('a').contents[0].strip()
print(driver.title)
# end get first chapter link

# start navigate to first chapter
link = driver.find_element_by_link_text(cLink)
link.click()
# end navigate to first chapter

# start copy of chapter and add to a file
def CopyChapter():
    wait = WebDriverWait(driver, 10)
    wait.until(lambda driver: driver.title != "Please wait 5 seconds...")
    print(driver.title)
    soup = BeautifulSoup.BeautifulSoup(driver.page_source, 'html.parser')
    readables = soup.find(id='divReadContent')
    text = readables.text.strip().replace('混','').replace('魔','').replace('族','').replace('デ','').replace('イ','').replace('ー','').replace('マ','').replace('ン','').replace('☆','').replace('ッ','Uh').replace('『','[').replace('』',']').replace('“','"').replace('”','"').replace('…','...').replace('ー','-').replace('○','0').replace('×','x').replace('《',' <<').replace('》','>> ').replace('「','"').replace('」','"')
    name = driver.title
    file_name = (name.replace('Read ', "").replace(' - NovelPlanet', "") + '.txt')
    print(file_name)
    #print(text)  # <-- This shows the correct text in the shell with no errors
    with open(file_name, 'a+') as file:
        print(text, file=file)  # <-- this never works without a bunch of .replace() where text is defined
    global lastURL
    lastURL = driver.current_url
    NextChapter()
# end copy of chapter and add to a file

# start goto next chapter if it exists, then return to CopyChapter(), else Close()
def NextChapter():
    soup = BeautifulSoup.BeautifulSoup(driver.page_source, 'html.parser')
    a = 0
    main = soup.find(class_='wrapper')
    for container in main.find_all(class_='container'):
        a += 1
        row = container.find(class_='row')
        b = 0
        for chapterLink in row.find_all(class_='4u 12u(small)'):
            b += 1
            cLink = chapterLink.find('a').contents[0].strip()
    link = driver.find_element_by_link_text(cLink)
    link.click()
    wait = WebDriverWait(driver, 10)
    wait.until(lambda driver: driver.title != "Please wait 5 seconds...")
    global currentURL
    currentURL = driver.current_url
    if currentURL != lastURL:
        CopyChapter()
    else:
        print('Finished!!!')
        Close()
# end goto next chapter

CopyChapter()
#EOF


Expected result: the text file matches the IDLE print(text) output exactly, with no changes at all. Then I could verify that every chapter gets copied for offline viewing and that the script stops at the last posted chapter.



As it stands, this will never work properly unless I keep adding more .replace() calls for every novel and chapter. I wouldn't mind removing the ad blurbs manually with .replace(), but if there is also a better way to do that, please teach me.
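For the ad blurbs, one idea (sketched below; the class name 'ads' is a made-up placeholder, since the real markup of the injected blocks isn't shown here) would be to remove those nodes from the soup with decompose() before reading .text:

import bs4 as BeautifulSoup

html = '''<div id="divReadContent">
  Chapter text...
  <div class="ads">Read light novels at ...</div>
  More chapter text.
</div>'''

soup = BeautifulSoup.BeautifulSoup(html, 'html.parser')
readables = soup.find(id='divReadContent')

# Remove ad nodes before extracting text; 'ads' is a placeholder --
# inspect the live page to find the actual class or id of the ad blocks.
for ad in readables.find_all(class_='ads'):
    ad.decompose()

text = readables.text.strip()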



Windows 10
Python 3.7.0



There was some reason for the os and os.path imports in an earlier version of this script, but I no longer remember whether they are still needed.
Tags: python-3.x, web-scraping, beautifulsoup, selenium-chromedriver