Hi
How could I use or develop an extension to get the source code (HTML or plain text) of the website I'm browsing?
Regards
Hi
How could I use or develop an extension to get the source code (HTML or plain text) of the website I'm browsing?
Regards
You could use the script I described earlier. The appex extension gets the current web page and stores it in a folder within Pythonista for you to use, much like Open in Pythonista.
Thanks a lot, Olaf. I'm going to check it.
It does not work with frames ... any other way?
@Pepo did you mean it doesn't work with an HTML iframe?
Grabbing the HTML code of a page won't give you the iframe's code, because that content comes from an external source.
If you're after that, I suggest using Beautiful Soup (bs4) to find the iframe, get its link, and then fetch the code from that link. You may also need to fix the link: a relative one such as "../stuff/morestuff.html" needs the rest of the URL, e.g. "http://www.google.com/stuff/morestuff.html".
If there is more than one iframe you may need to select from a list which one you want (if it's not easily identifiable with bs4)
I'm also kind of assuming there is a particular webpage that you want to get the iframe code from. I could be wrong...!
Here's a start:
I've been trying to learn more of bs4, so this is a bit of practice for me too...
# coding: utf-8
import requests
from bs4 import BeautifulSoup
def make_soup(url):
    """Fetch *url* and parse the response body into a BeautifulSoup object.

    The parser is named explicitly. Without it bs4 picks whichever parser
    happens to be installed, so the same page can parse differently from one
    machine to the next.

    Args:
        url: address of the page to download.

    Returns:
        A BeautifulSoup object built from the response text.
    """
    response = requests.get(url)
    return BeautifulSoup(response.text, 'html.parser')
def get_iframe_src_html(soup, base_url=None):
    """Return the HTML served at the ``src`` of the first iframe in *soup*.

    Args:
        soup: a BeautifulSoup object for the page that embeds the iframe.
        base_url: optional URL of that page. When given, a relative ``src``
            such as ``../stuff/page.html`` is resolved against it.

    Returns:
        The text of the response for the iframe's source URL.

    Raises:
        ValueError: if the page has no iframe, or the iframe has no ``src``.
    """
    # Local import so this block does not rely on the file's import section;
    # urljoin lives in urlparse on Python 2 and urllib.parse on Python 3.
    try:
        from urllib.parse import urljoin
    except ImportError:
        from urlparse import urljoin

    # Only the first iframe is considered.
    # TODO: code for more than one iframe in page use case
    iframe = soup.find('iframe')
    if iframe is None:
        raise ValueError('no <iframe> element found in the page')
    src_url = iframe.get('src')
    if not src_url:
        raise ValueError('the <iframe> element has no src attribute')

    if base_url:
        # urljoin leaves an absolute src untouched and completes a relative one.
        src_url = urljoin(base_url, src_url)
    return requests.get(src_url).text
# Example: parse an inline page, so only the iframe target is downloaded.
example_page = '<html><body><iframe src="http://www.google.com"></iframe></body></html>'
soup = BeautifulSoup(example_page, 'html.parser')
# Parenthesised so the line is valid under both Python 2 and Python 3.
print(get_iframe_src_html(soup))
# Example usage with a live page:
#soup = make_soup('http://www.google.com')
#iframe_html = get_iframe_src_html(soup)
If I understand your request correctly, here is a script to view, and optionally save, the source code of a URL.
# coding: utf-8
import appex
import requests
import os
import ui
import console
import webbrowser
import clipboard
import urllib
import urlparse
import time
def path2url(path):
    """Return a ``file:`` URL for *path*.

    *path* is joined onto the user's ~/Documents directory; an absolute
    *path* is therefore used as it is.
    """
    documents_dir = os.path.expanduser("~/Documents")
    full_path = os.path.join(documents_dir, path)
    quoted_path = urllib.pathname2url(full_path)
    return urlparse.urljoin('file:', quoted_path)
class MyView(ui.View):
    """Root view of the script; it adds no behaviour of its own to ui.View."""

    def will_close(self):
        """Do nothing when the view is about to close."""
def oui_action(sender):
    """Handle the 'yes' button: keep the saved source as an .html file.

    Renames the module-level ``file_txt`` to ``file_html`` and sets the
    module-level ``button_pressed`` flag.

    Args:
        sender: the control that triggered the action (unused).

    Raises:
        OSError: if the rename fails; the flag is still set in that case.
    """
    global button_pressed
    try:
        os.rename(file_txt, file_html)
    finally:
        # Set the flag even when the rename fails: main() polls this flag,
        # and would otherwise wait forever.
        button_pressed = True
def non_action(sender):
    """Handle the 'Non' button: discard the saved source file.

    Deletes the module-level ``file_txt`` and sets the module-level
    ``button_pressed`` flag.

    Args:
        sender: the control that triggered the action (unused).

    Raises:
        OSError: if the file cannot be removed; the flag is still set.
    """
    global button_pressed
    try:
        os.remove(file_txt)
    finally:
        # Set the flag even when the removal fails: main() polls this flag,
        # and would otherwise wait forever.
        button_pressed = True
def _make_button(title, x, y, action):
    """Build one bordered 80x32 button with the given title, position and action."""
    button = ui.Button()
    button.border_color = 'black'
    button.border_width = 1
    button.width = 80
    button.height = 32
    button.x = x
    button.y = y
    button.title = title
    button.alignment = ui.ALIGN_CENTER
    button.font = ('Courier', 20)
    button.text_color = 'black'
    button.hidden = False
    button.action = action
    return button


def main():
    """Show the source of a web page and ask whether to keep the saved copy.

    The URL comes from ``appex.get_url()`` when running as an extension and
    from the clipboard otherwise. The page is downloaded, written to
    ``View-OpenPageSource.txt`` and displayed in a web view inside the
    module-level ``back`` view. The function then waits until one of the two
    buttons sets ``button_pressed`` and closes ``back``.

    Returns early, after an alert, when no URL is available or when it does
    not start with ``http://`` or ``https://``.
    """
    global button_pressed, file_txt, file_html
    # Local import: only needed for the encoded write further down.
    import io

    console.clear()
    if appex.is_running_extension():
        url = appex.get_url()
    else:
        url = clipboard.get()
    # Checked for both sources, so a missing URL never reaches startswith().
    if url is None:
        console.alert('Nothing in the ClipBoard','','Ok',hide_cancel_button=True)
        return
    if not url.startswith(('http://', 'https://')):
        console.alert('ClipBoard does not contain a valid URL','','Ok',hide_cancel_button=True)
        return

    # Web view that displays the saved source; the bottom strip is left
    # free for the label and the two buttons.
    web = ui.WebView(name='web', frame=(0, 0, back.width, back.height - 32 - 2*10))
    web.border_color = 'blue'
    web.border_width = 1
    back.add_subview(web)

    # Label that carries the question, centred between the two buttons
    titlbl = ui.Label(name='titlbl')
    titlbl.width = back.width - 80*2 - 10*4
    titlbl.height = 32
    titlbl.x = 80 + 10*2
    titlbl.y = web.y + web.height + 10
    titlbl.text = ''
    titlbl.alignment = ui.ALIGN_CENTER
    titlbl.font = ('Courier-Bold', 20)
    titlbl.text_color = 'black'
    back.add_subview(titlbl)

    # Button: yes (right-hand side)
    oui_button = _make_button('yes', web.x + web.width - 80 - 10, titlbl.y, oui_action)
    back.add_subview(oui_button)

    # Button: non (left-hand side)
    non_button = _make_button('Non', 10, titlbl.y, non_action)
    back.add_subview(non_button)

    # Read page contents
    source = requests.get(url).text

    # Where to save the source
    filename = 'View-OpenPageSource'
    file_txt = os.path.abspath(filename + '.txt')
    file_html = os.path.abspath(filename + '.html')

    # Save the source. The text is written through an explicit UTF-8
    # encoding, so pages with non-ASCII characters can be saved too.
    with io.open(file_txt, 'w', encoding='utf-8') as f:
        f.write(source)

    # Display the source
    web.load_url(path2url(file_txt))

    # Ask if source file to be kept
    titlbl.text = 'Keep the souce file?'

    # Wait until one of the button actions sets the flag
    button_pressed = False
    while not button_pressed:
        time.sleep(0.5)
    back.close()
# Normally started from the sharing action in Safari; it can also be started
# from Launcher, in which case the URL is passed through the clipboard.
back = MyView()
back.name = 'View/Open Page Source'
back.background_color = 'white'
# Sheet when running as an extension, full screen otherwise.
disp_mode = 'sheet' if appex.is_running_extension() else 'full_screen'
back.present(disp_mode, hide_title_bar=False)

# Run only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
    if not appex.is_running_extension():
        # Back to home screen
        webbrowser.open('launcher://crash')
    else:
        appex.finish()
import urllib2

html_file_url = ""  # set your file's url here
download_to = ""  # set the name of the file you want to save the html to

# urllib2 has no url_open(); the function is called urlopen().
# Download first, so a failed request does not leave an empty file behind.
response = urllib2.urlopen(html_file_url)
try:
    html = response.read()
finally:
    response.close()

# read() returns raw bytes, hence binary mode; the with-block closes the file.
with open(download_to, "wb") as out_file:
    out_file.write(html)
@cvp An alternative could be:
# Before: uses the legacy `<>` operator, and each slice length has to match its prefix by hand.
if url[:7] <> 'http://' and url[:8] <> 'https://':
# --> after: the same test, written with str.startswith and a tuple of prefixes
if not url.startswith(('http://', 'https://')):
<> is deprecated in Python2 and removed in Python3. Use != instead.
@AtomBombed It is not recommended to open files without closing them. http://stackoverflow.com/questions/7395542/is-explicitly-closing-files-important
# Download first, so a failed request does not leave an empty file behind.
html = urllib2.urlopen(html_file_url).read()  # urlopen(), not url_open()
# read() returns raw bytes, hence binary mode.
with open(download_to, "wb") as out_file:  # will automatically close()
    out_file.write(html)
@ccc thanks, I always forget this !=...
@ccc forgot to put that in at the end. I normally do with my own code.
So I know the question has been answered, but I just wanted to share this. It's just another approach.
import ui, appex
class wvdelegate(object):
    """WebView delegate that replaces a loaded page with its own HTML source."""

    def webview_did_finish_load(self, webview):
        """Show the source of the page in *webview* instead of the page itself.

        Args:
            webview: the view whose page has finished loading. Its delegate
                is cleared by this call.
        """
        html = webview.eval_js('document.documentElement.innerHTML')
        # Detach before reloading; presumably this keeps the load started by
        # load_html() from calling back into this method.
        webview.delegate = None
        # The element has to be closed with </xmp>; the former r'<\xmp>' is
        # not a closing tag.
        webview.load_html('<xmp>' + html + '</xmp>')
# Load the shared page in a web view and attach the delegate that reacts
# once loading has finished.
source_delegate = wvdelegate()
shared_url = appex.get_url()
wv = ui.WebView()
wv.load_url(shared_url)
wv.delegate = source_delegate
wv.present()
You could also do it like this to copy the HTML:
import ui, appex, clipboard
class wvdelegate(object):
    """WebView delegate that shows a page's HTML source and offers to copy it."""

    # HTML captured from the page; None until the first load has finished.
    html = None

    def webview_did_finish_load(self, webview):
        """Show the source of the loaded page and add a copy button.

        Args:
            webview: the view whose page has finished loading.
        """
        if self.html is not None:
            # The source is already on screen; do not capture and wrap it
            # again. This replaces overwriting the method with None, which
            # left a non-callable attribute under the callback's name.
            return
        self.html = webview.eval_js('document.documentElement.innerHTML')
        # The element has to be closed with </xmp>; the former r'<\xmp>' is
        # not a closing tag.
        webview.load_html('<xmp>' + self.html + '</xmp>')
        # Use the view passed in rather than a module-level name, so the
        # delegate works with whichever web view it is attached to.
        webview.right_button_items = [
            ui.ButtonItem(
                image=ui.Image('iob:clipboard_32'),
                action=lambda sender: clipboard.set(self.html),
            ),
        ]
# Load the shared page in a web view and attach the delegate that reacts
# once loading has finished.
copy_delegate = wvdelegate()
shared_url = appex.get_url()
wv = ui.WebView()
wv.load_url(shared_url)
wv.delegate = copy_delegate
wv.present()