Any tricks to fix encoding problems without a metric-ton of .replace()? Python3 Chrome-Driver BS4
The print() command prints the scraped website perfectly to the IDLE shell. However, write/writelines/print will not write the same text to a file without throwing a pile of encode errors or producing garbled "geek-squad" characters. I have tried various forms of .encode(encoding='...', errors='...') to no avail: every encoding I tried either produced more garbled output or turned the text into runs of ?'s in the file. If I were willing to spend ten years chaining .replace('...','...') calls, as shown where text = ... is defined in the code below, I could get this to work completely.
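If I understand the failure correctly, open() without an encoding argument uses the locale's default codec, which on Windows is typically cp1252 and cannot represent most of these characters. Here is a minimal sketch of the suspected cause and fix; the file name and sample string are made up, and cp1252 being the culprit is an assumption:

sample = '『Isekai』…○×☆'   # made-up string with characters like the scraped pages contain

# With no encoding argument, the locale default (cp1252 on Windows) raises
# UnicodeEncodeError when these characters are written:
#with open('chapter.txt', 'a+') as f:
#    print(sample, file=f)

# Passing an explicit encoding sidesteps the error with no .replace() at all:
with open('chapter.txt', 'a+', encoding='utf-8') as f:
    print(sample, file=f)

If that is the right general fix, it would replace the entire .replace() chain with one keyword argument.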
#! python3
import os          # os / os.path kept from an earlier version of this script;
import os.path     # possibly no longer needed (see note below)
from os import path
import requests
import bs4 as BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.chrome.options import Options

def Close():
    driver.stop_client()
    driver.close()
    driver.quit()

CHROMEDRIVER_PATH = r'E:\Downloads\chromedriver_win32\chromedriver.exe'

# start raw html
NovelName = 'Novel/Isekai-Maou-to-Shoukan-Shoujo-Dorei-Majutsu'
BaseURL = 'https://novelplanet.com'
url = '%(U)s/%(N)s' % {'U': BaseURL, 'N': NovelName}

options = Options()
options.add_experimental_option("excludeSwitches", ["ignore-certificate-errors"])
#options.add_argument("--headless")   # Runs Chrome in headless mode.
#options.add_argument('--no-sandbox')  # Bypass OS security model
#options.add_argument('--disable-gpu')  # applicable to Windows OS only
options.add_argument('start-maximized')
options.add_argument('disable-infobars')
#options.add_argument("--disable-extensions")

driver = webdriver.Chrome(CHROMEDRIVER_PATH, options=options)
driver.get(url)

# wait for the title to no longer be "Please wait 5 seconds..."
wait = WebDriverWait(driver, 10)
wait.until(lambda driver: driver.title != "Please wait 5 seconds...")
soup = BeautifulSoup.BeautifulSoup(driver.page_source, 'html.parser')
# End raw html

# Start get first chapter link
i = 0
for chapterLink in soup.find_all(class_='rowChapter'):
    i += 1
    cLink = chapterLink.find('a').contents[0].strip()  # ends up holding the last link's text
print(driver.title)
# end get first chapter link

# start navigate to first chapter
link = driver.find_element_by_link_text(cLink)
link.click()
# end navigate to first chapter

# start copy of chapter and add to a file
def CopyChapter():
    wait = WebDriverWait(driver, 10)
    wait.until(lambda driver: driver.title != "Please wait 5 seconds...")
    print(driver.title)
    soup = BeautifulSoup.BeautifulSoup(driver.page_source, 'html.parser')
    readables = soup.find(id='divReadContent')
    text = readables.text.strip().replace('混','').replace('魔','').replace('族','').replace('デ','').replace('イ','').replace('ー','').replace('マ','').replace('ン','').replace('☆','').replace('ッ','Uh').replace('『','[').replace('』',']').replace('“','"').replace('”','"').replace('…','...').replace('ー','-').replace('○','0').replace('×','x').replace('《',' <<').replace('》','>> ').replace('「','"').replace('」','"')
    name = driver.title
    file_name = (name.replace('Read ', "").replace(' - NovelPlanet', "") + '.txt')
    print(file_name)
    #print(text)  # <-- This shows the correct text in the shell with no errors
    with open(file_name, 'a+') as file:
        print(text, file=file)  # <-- this never works without the .replace() chain above
    global lastURL
    lastURL = driver.current_url
    NextChapter()
# end copy of chapter and add to a file

# start goto next chapter if it exists, then return to CopyChapter, else Close()
def NextChapter():
    soup = BeautifulSoup.BeautifulSoup(driver.page_source, 'html.parser')
    a = 0
    main = soup.find(class_='wrapper')
    for container in main.find_all(class_='container'):
        a += 1
        row = container.find(class_='row')
        b = 0
        for chapterLink in row.find_all(class_='4u 12u(small)'):
            b += 1
            cLink = chapterLink.find('a').contents[0].strip()
    link = driver.find_element_by_link_text(cLink)
    link.click()
    wait = WebDriverWait(driver, 10)
    wait.until(lambda driver: driver.title != "Please wait 5 seconds...")
    global currentURL
    currentURL = driver.current_url
    if currentURL != lastURL:
        CopyChapter()
    else:
        print('Finished!!!')
        Close()
# end goto next chapter

CopyChapter()
#EOF
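Incidentally, the bracket-and-punctuation normalization I actually want to keep should not need a chain of calls either: a single str.translate() table can apply all of it in one pass. A sketch built only from mappings already present in the code above; the sample string is made up:

# One translation table instead of a chain of .replace() calls.
# Keys are single characters; values may be longer strings, or None to delete.
NORMALIZE = str.maketrans({
    '『': '[',  '』': ']',
    '「': '"',  '」': '"',
    '“': '"',   '”': '"',
    '…': '...', 'ー': '-',
    '○': '0',   '×': 'x',
    '☆': None,             # None removes the character outright
})

raw = '「Hello」…『world』☆'             # made-up stand-in for readables.text
print(raw.strip().translate(NORMALIZE))  # -> "Hello"...[world]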
The expected result is a text file whose contents exactly match the IDLE print(text) output, with no changes at all. Then I could verify that every chapter gets copied for offline viewing and that the script stops at the last posted chapter.
As things stand, this will never work properly unless I keep adding more and more .replace() calls for every novel and chapter. I wouldn't mind removing the ad descriptions manually with .replace(), but if there is a better way to do that as well, please teach me.
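On the ads specifically: it seems cleaner to delete the offending nodes from the parse tree before calling .text than to scrub the joined string afterwards. A sketch of that idea, dropped in where readables is built above; the tag names are guesses, since I have not confirmed which elements actually carry the ads on NovelPlanet:

soup = BeautifulSoup.BeautifulSoup(driver.page_source, 'html.parser')
readables = soup.find(id='divReadContent')

# Drop script/iframe/ins nodes (common ad carriers) before extracting text.
for junk in readables.find_all(['script', 'iframe', 'ins']):
    junk.decompose()   # removes the node and everything inside it from the tree

text = readables.get_text().strip()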
Windows 10
Python 3.7.0
There was some reason for os and os.path in an earlier version of this script, but I no longer remember whether they are still needed.
python-3.x web-scraping beautifulsoup selenium-chromedriver
asked Dec 28 '18 at 18:11 by James (196)