Any tricks to fix encoding problems without a metric-ton of .replace()? Python3 Chrome-Driver BS4
The print() command prints the scraped website perfectly to the IDLE shell. However, write/writelines/print will not write the same text to a file without throwing a pile of encode errors or producing garbled "geek-squad" characters. I have tried various forms of .encode(encoding='...', errors='...') to no avail: every encoding I tried either produced more garbled output or turned the text into runs of ?'s in the file. If I were willing to spend ten years chaining .replace('...','...') calls, as shown where text = ... is defined in the code below, I could get this to work completely.
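If I understand the failure correctly, open() without an encoding argument uses the locale's default codec, which on Windows is typically cp1252 and cannot represent most of these characters. Here is a minimal sketch of the suspected cause and fix; the file name and sample string are made up, and cp1252 being the culprit is an assumption:

sample = '『Isekai』…○×☆'   # made-up string with characters like the scraped pages contain

# With no encoding argument, the locale default (cp1252 on Windows) raises
# UnicodeEncodeError when these characters are written:
#with open('chapter.txt', 'a+') as f:
#    print(sample, file=f)

# Passing an explicit encoding sidesteps the error with no .replace() at all:
with open('chapter.txt', 'a+', encoding='utf-8') as f:
    print(sample, file=f)

If that is the right general fix, it would replace the entire .replace() chain with one keyword argument.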
#! python3
import os          # os / os.path kept from an earlier version of this script;
import os.path     # possibly no longer needed (see note below)
from os import path
import requests
import bs4 as BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.chrome.options import Options

def Close():
    driver.stop_client()
    driver.close()
    driver.quit()

CHROMEDRIVER_PATH = r'E:\Downloads\chromedriver_win32\chromedriver.exe'

# start raw html
NovelName = 'Novel/Isekai-Maou-to-Shoukan-Shoujo-Dorei-Majutsu'
BaseURL = 'https://novelplanet.com'
url = '%(U)s/%(N)s' % {'U': BaseURL, 'N': NovelName}

options = Options()
options.add_experimental_option("excludeSwitches", ["ignore-certificate-errors"])
#options.add_argument("--headless")   # Runs Chrome in headless mode.
#options.add_argument('--no-sandbox')  # Bypass OS security model
#options.add_argument('--disable-gpu')  # applicable to Windows OS only
options.add_argument('start-maximized')
options.add_argument('disable-infobars')
#options.add_argument("--disable-extensions")

driver = webdriver.Chrome(CHROMEDRIVER_PATH, options=options)
driver.get(url)

# wait for the title to no longer be "Please wait 5 seconds..."
wait = WebDriverWait(driver, 10)
wait.until(lambda driver: driver.title != "Please wait 5 seconds...")
soup = BeautifulSoup.BeautifulSoup(driver.page_source, 'html.parser')
# End raw html

# Start get first chapter link
i = 0
for chapterLink in soup.find_all(class_='rowChapter'):
    i += 1
    cLink = chapterLink.find('a').contents[0].strip()  # ends up holding the last link's text
print(driver.title)
# end get first chapter link

# start navigate to first chapter
link = driver.find_element_by_link_text(cLink)
link.click()
# end navigate to first chapter

# start copy of chapter and add to a file
def CopyChapter():
    wait = WebDriverWait(driver, 10)
    wait.until(lambda driver: driver.title != "Please wait 5 seconds...")
    print(driver.title)
    soup = BeautifulSoup.BeautifulSoup(driver.page_source, 'html.parser')
    readables = soup.find(id='divReadContent')
    text = readables.text.strip().replace('混','').replace('魔','').replace('族','').replace('デ','').replace('イ','').replace('ー','').replace('マ','').replace('ン','').replace('☆','').replace('ッ','Uh').replace('『','[').replace('』',']').replace('“','"').replace('”','"').replace('…','...').replace('ー','-').replace('○','0').replace('×','x').replace('《',' <<').replace('》','>> ').replace('「','"').replace('」','"')
    name = driver.title
    file_name = (name.replace('Read ', "").replace(' - NovelPlanet', "") + '.txt')
    print(file_name)
    #print(text)  # <-- This shows the correct text in the shell with no errors
    with open(file_name, 'a+') as file:
        print(text, file=file)  # <-- this never works without the .replace() chain above
    global lastURL
    lastURL = driver.current_url
    NextChapter()
# end copy of chapter and add to a file

# start goto next chapter if it exists, then return to CopyChapter, else Close()
def NextChapter():
    soup = BeautifulSoup.BeautifulSoup(driver.page_source, 'html.parser')
    a = 0
    main = soup.find(class_='wrapper')
    for container in main.find_all(class_='container'):
        a += 1
        row = container.find(class_='row')
        b = 0
        for chapterLink in row.find_all(class_='4u 12u(small)'):
            b += 1
            cLink = chapterLink.find('a').contents[0].strip()
    link = driver.find_element_by_link_text(cLink)
    link.click()
    wait = WebDriverWait(driver, 10)
    wait.until(lambda driver: driver.title != "Please wait 5 seconds...")
    global currentURL
    currentURL = driver.current_url
    if currentURL != lastURL:
        CopyChapter()
    else:
        print('Finished!!!')
        Close()
# end goto next chapter

CopyChapter()
#EOF
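Incidentally, the bracket-and-punctuation normalization I actually want to keep should not need a chain of calls either: a single str.translate() table can apply all of it in one pass. A sketch built only from mappings already present in the code above; the sample string is made up:

# One translation table instead of a chain of .replace() calls.
# Keys are single characters; values may be longer strings, or None to delete.
NORMALIZE = str.maketrans({
    '『': '[',  '』': ']',
    '「': '"',  '」': '"',
    '“': '"',   '”': '"',
    '…': '...', 'ー': '-',
    '○': '0',   '×': 'x',
    '☆': None,             # None removes the character outright
})

raw = '「Hello」…『world』☆'             # made-up stand-in for readables.text
print(raw.strip().translate(NORMALIZE))  # -> "Hello"...[world]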
The expected result is a text file whose contents exactly match the IDLE print(text) output, with no changes at all. Then I could verify that every chapter gets copied for offline viewing and that the script stops at the last posted chapter.
As things stand, this will never work properly unless I keep adding more and more .replace() calls for every novel and chapter. I wouldn't mind removing the ad descriptions manually with .replace(), but if there is a better way to do that as well, please teach me.
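On the ads specifically: it seems cleaner to delete the offending nodes from the parse tree before calling .text than to scrub the joined string afterwards. A sketch of that idea, dropped in where readables is built above; the tag names are guesses, since I have not confirmed which elements actually carry the ads on NovelPlanet:

soup = BeautifulSoup.BeautifulSoup(driver.page_source, 'html.parser')
readables = soup.find(id='divReadContent')

# Drop script/iframe/ins nodes (common ad carriers) before extracting text.
for junk in readables.find_all(['script', 'iframe', 'ins']):
    junk.decompose()   # removes the node and everything inside it from the tree

text = readables.get_text().strip()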
Windows 10
Python 3.7.0
There was some reason for os and os.path in an earlier version of this script, but I no longer remember whether they are still needed.
python-3.x web-scraping beautifulsoup selenium-chromedriver
asked Dec 28 '18 at 18:11 by James (196)