def get_value(a):
return a[1:a.find(">")-1]
hrefs = map(get_value,open("hypertext.html","r").read().split("<a href="))
braydon@bgf:~/python_tests/extract$ time python split.py real 0m1.263s user 0m1.112s sys 0m0.156s braydon@bgf:~/python_tests/extract$ time python split_2.py real 0m0.392s user 0m0.268s sys 0m0.120s
def is_ahref(a,b):
y = b.find("<a href=")
if y != -1: a.append(b[y+9:-1])
return a
def preduce(fn,ls,a):
ls.insert(0,a)
return reduce(fn,ls)
hrefs = preduce(is_ahref,open("hypertext.html","r").read().split(">"),[])
There is a better solution. Splitting the text up by the ">" character is wasteful; there are many ">"s in html, and most of them that will not have hyperlinks. We don't need to even check if the item in the list is an href if we split the string into a list that all will have an href, and then reduce it as before.
def is_ahref(a,b):
z = b.find(">")
a.append(b[1:z-1])
return a
def preduce(fn,ls,a):
ls.insert(0,a)
return reduce(fn,ls)
hrefs = preduce(is_ahref,open("hypertext.html","r").read().split("<a href="),[])
However because the size of the list will be exactly the same as it started, we shouldn't need to use reduce(), but rather we can just map() a fuction to run through the entire list, 'reducing' it to a list of just the hyperlinks.
def get_value(a):
return a[1:a.find(">")-1]
hrefs = map(get_value,open("hypertext.html","r").read().split("<a href="))
Sat Jun 16, 2008
f = open('hypertext_sm.html','r')
ahrefs = []
count = []
def find_ahref(h):
a = h.find("<a href=")
if a != -1:
a = a+9
b = a + h[a:-1].find(">")-1
ahrefs.append(h[a:b])
find_ahref(h[b:-1])
find_ahref(f.read())
f.close()
f = open('hypertext_sm.html','r')
def find_ahref(a,b):
try:
c = a[1] + str(b)
x = c.find("<a href=")
y = c[x:-1].find(">")
if x != -1 and y != -1:
a[0].append(c[x+9:x+y-1])
return (a[0],"")
else:
return (a[0],c)
except:
return ([],str(a)+str(b))
hrefs = reduce(find_ahref,f.read())[0]
f.close()
f = open('hypertext.html','r')
ahrefs = []
def find_ahref(h):
a = h.find("<a href=")
if a != -1:
a = a+9
b = a + h[a:-1].find(">")-1
ahrefs.append(h[a:b])
find_ahref(h[b:-1])
while True:
line = f.readline()
if line:
find_ahref(line)
else:
break
f.close()
f = open('hypertext.html','r')
def ahref_reducer(a,b):
try:
c = a[1] + str(b)
x = c.find("<a href=")
y = c[x:-1].find(">")
if x != -1 and y != -1:
a[0].append(c[x+9:x+y-1])
return (a[0],"")
else:
return (a[0],c)
except:
return ([],str(a)+str(b))
def get_ahrefs(line):
return reduce(ahref_reducer,line)[0]
lines = f.readlines()
hrefs = map(get_ahrefs,lines)
f.close()
f = open('hypertext.html','r')
def find_ahref(z,htxt):
a = htxt.find("<a href=")
if a != -1:
a = a+9
b = a+htxt[a:-1].find(">")-1
href = htxt[a:b]
z.append(href)
return find_ahref(z,htxt[b:-1])
else:
return z
def preduce(fn,ls,a):
ls.insert(0,a)
return reduce(fn,ls)
lines = f.readlines()
hrefs = preduce(find_ahref,lines,[])
f.close()
hrefs = []
f = open("hypertext.html","r")
f_str = f.read()
f.close()
while True:
b = f_str.find(">")
if b == -1:
break
a = f_str[0:b].find("<a href=")
if a != -1:
hrefs.append(f_str[a+9:b-1])
f_str = f_str[b+1:-1]
def is_ahref(a,b):
y = b.find("<a href=")
if y != -1: a.append(b[y+9:-1])
return a
def preduce(fn,ls,a):
ls.insert(0,a)
return reduce(fn,ls)
hrefs = preduce(is_ahref,open("hypertext.html","r").read().split(">"),[])
Sat Jun 14, 2008
#import a list of 247,213 integers
from oids import oids
def pre(a):
return (list(), a, 0)
def make_sets(a,b):
set_size = 8
if a[2] == set_size - 1 or a[0] == list():
a[0].append([a[1]])
return (a[0],b[1],0)
else:
a[0][-1].append(a[1])
return (a[0],b[1],a[2]+1)
reduce(make_sets,map(pre,oids))
real 0m0.935s user 0m0.896s sys 0m0.032s real 0m0.948s user 0m0.920s sys 0m0.028s real 0m0.929s user 0m0.900s sys 0m0.024s
from oids import oids
output = list()
count = 0
set_size = 8
for oid in oids:
if count == set_size or output == list():
output.append([oid])
count = 0
else:
output[-1].append(oid)
count = count + 1
real 0m0.429s user 0m0.404s sys 0m0.024s real 0m0.396s user 0m0.384s sys 0m0.012s real 0m0.410s user 0m0.396s sys 0m0.012s
from oids import oids
output = list()
set_size = 8
count = [0]
def break_apart(a):
if count[-1] == set_size or output == list():
output.append([a])
count.append(0)
else:
output[-1].append(a)
count.append(count[-1] + 1)
map(break_apart,oids)
real 0m0.484s user 0m0.476s sys 0m0.012s real 0m0.483s user 0m0.464s sys 0m0.016s real 0m0.482s user 0m0.452s sys 0m0.028s
from oids import oids
def seperate(a,b,length=8):
try:
if len(a[-1]) == length:
a.append([b])
return a
else:
a[-1].append(b)
return a
except:
return [[a,b]]
oids = reduce(seperate,oids)
real 0m0.323s user 0m0.308s sys 0m0.016s real 0m0.329s user 0m0.300s sys 0m0.028s real 0m0.353s user 0m0.332s sys 0m0.020s