My favorites | Sign in
Project Home Downloads Wiki Issues Source
Checkout   Browse   Changes    
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
#
# Title: htmlizer script, prepare HTML for Drupal publishing
# Author: Sebastien Lelong, Copyright (c) 2008, all rights reserved.
# Adapted-by:
# Compiler:
#
# This file is part of jallib (http://jallib.googlecode.com)
# Released under the BSD license (http://www.opensource.org/licenses/bsd-license.php)
#
# Sources:
#
# Description: this script takes an HTML file as input, keeps the "body" element's
# content and converts some URLs to make them compatible with Drupal's. Output is
# written to a "content" file, in a "topublish" directory, within the one containing
# the original HTML files. It also picks the images in use from their original "src"
# location and puts them in an "attachments" sub-directory (so they are selected and
# ready for publishing).
#


import os
import re
import shutil
import sys
import urlparse

from BeautifulSoup import BeautifulSoup, BeautifulStoneSoup

# --- Drupal-side URL prefixes ---------------------------------------------
# Prefix given to every rewritten image URL (images, and attachments in
# general, live in this directory).
DRUPAL_IMG_PATH_PREFIX = "/sites/default/files/"
# Prefix given to every rewritten local link. URL aliasing prefixes with
# "content/" by default, but other URLs can be defined.
DRUPAL_CONTENT_PREFIX = "/content/"

# --- Local output layout ----------------------------------------------------
# Directory receiving everything to publish, and its sub-directory for the
# copied images.
OUTPUT_DIR = "topublish"
ATTACH_DIR = "attachments"
# Names of the generated files: filtered HTML, page title and Drupal path.
CONTENT_FILE = "content"
TITLE_FILE = "title"
PATH_FILE = "path"


# Two positional arguments are mandatory: the configuration module, then the
# HTML file to process.
try:
    conffile = sys.argv[1]
    hfile = sys.argv[2]
except IndexError:
    print >> sys.stderr, "Please provide a config file and a HTML file as input"
    sys.exit(255)

# The configuration is imported as a Python module, so it can be named with
# or without its ".py" suffix. Only a trailing ".py" is removed: replacing
# every occurrence would mangle names that merely contain ".py".
if conffile.endswith(".py"):
    conffile = conffile[:-len(".py")]
# Import by name instead of exec()-ing a statement assembled from argv, so
# the argument can never be run as arbitrary code. The non-empty fromlist
# makes __import__ return the named module itself even for a dotted name.
japp_config = __import__(conffile, globals(), locals(), ["JAPP_CONTEXT_URL"])
# URL fragment inserted between Drupal's content prefix and the page name.
JAPP_CONTEXT_URL = japp_config.JAPP_CONTEXT_URL

# Prepare the output directory.
# os.path.dirname() returns "" for a bare file name; fall back to the current
# directory so the "<dirn>/topublish" paths built from here on do not collapse
# to "/topublish" at the filesystem root.
dirn = os.path.dirname(hfile) or os.curdir
basen = os.path.basename(hfile)

outdir = os.path.join(dirn, OUTPUT_DIR)
# Remove the files left by a previous run. This mirrors the former
# "rm -f <outdir>/*": hidden entries are not matched and sub-directories are
# left in place. Done in Python so unusual characters in the path cannot be
# interpreted by a shell.
if os.path.isdir(outdir):
    for entry in os.listdir(outdir):
        if entry.startswith("."):
            continue
        stale = os.path.join(outdir, entry)
        if os.path.isdir(stale) and not os.path.islink(stale):
            continue
        try:
            os.remove(stale)
        except OSError as err:
            print >> sys.stderr, "Cannot remove '%s': %s" % (stale, err)

# Equivalent of "mkdir -p": create the attachments directory and any missing
# parent. A failure raises here instead of surfacing later as a write error.
attachdir = os.path.join(outdir, ATTACH_DIR)
if not os.path.isdir(attachdir):
    os.makedirs(attachdir)


# Parse the input page; the file handle is released as soon as it is read.
with open(hfile) as source:
    html = BeautifulSoup(source.read())

# Extract the body content. After this step "body" is a single node: the
# <body> element, or the whole document when the input has no <body>.
body = html.findAll("body")
if not body:
    print >> sys.stderr, "No <body> element found, assuming content corresponds to inner body"
    body = html
elif len(body) > 1:
    print >> sys.stderr, "More than one <body> element found, c'mon that's impossible, are you joking with me buddy ?..."
    sys.exit(1)
else:
    body = body[0]

# Convert image URLs (src attribute) and copy the images to the output
# directory.
# Why ? Because when the DITA compiler processes a DITA file, it resolves
# outer DITA topics, with their images. So you finally get more than needed...
# I, Seb, can't find an option in the compiler to just produce HTML for one
# file (no outer topic) *and* still copy related images (I can do the first,
# without the second, using onlytopic.in.map=true, but the images don't get
# copied...)
imgs = body.findAll("img")
# <a> elements whose href was already rewritten here because they wrap an
# image and point at it; the link conversion step must leave them alone.
enclosingas = []
attachtarget = "%s/%s/%s/" % (dirn, OUTPUT_DIR, ATTACH_DIR)
for img in imgs:
    origsrc = img.get("src")
    if not origsrc:
        # an <img> without a src has nothing to rewrite nor to copy
        continue
    imgfn = os.path.basename(origsrc)
    img['src'] = DRUPAL_IMG_PATH_PREFIX + imgfn
    # is it enclosed by a <A> element ? (clickable ?)
    if img.parent.name == "a" and img.parent.get("href") == origsrc:
        img.parent['href'] = img['src']
        enclosingas.append(img.parent)
    # fix relative path (quite dirty...): drop every "../" so the image is
    # looked up below the directory holding the HTML file
    origsrc = origsrc.replace(os.path.pardir + os.path.sep, "")
    imgpath = "%s/%s" % (dirn, origsrc)
    # Copy without going through a shell, so spaces or shell metacharacters
    # in a file name are harmless. A failed copy is reported and skipped, as
    # a failing "cp" was before.
    try:
        shutil.copy(imgpath, attachtarget)
    except (IOError, OSError, shutil.Error) as err:
        print >> sys.stderr, "Cannot copy image '%s': %s" % (imgpath, err)


# Convert link URLs.
# Only a trailing ".html" (any case) is an extension to remove; anchoring the
# pattern keeps an ".html" found in the middle of a name intact.
pat = re.compile(r"\.html$", re.I)
# input file name without its extension, used to build the Drupal path
noext = pat.sub("", basen)
as_ = body.findAll("a")
for a in as_:
    # if the <a> element encloses an image (makes the image clickable) skip
    # it, its href was adjusted in the image step
    if a in enclosingas:
        continue
    href = a.get("href")
    if href is None:
        # no href, skip it
        continue
    scheme, netloc, path, params, query, fragment = urlparse.urlparse(href)
    if scheme or netloc:
        # external link (including scheme-less "//host/..." URLs), skip it
        continue
    if not path:
        # must be just an anchor, skip it
        continue
    # if we get here, the URL is a local file
    # remove the html suffix, as it is removed in Drupal too
    path = pat.sub("", path)
    # also remove any leading directories, just keep the last part of the
    # URL, corresponding to the XML file
    path = path.split("/")[-1]
    # add the prefix for Drupal's content
    path = DRUPAL_CONTENT_PREFIX + JAPP_CONTEXT_URL + "/" + path

    # back to the <a> element; params, query and fragment are kept as is
    a['href'] = urlparse.urlunparse((scheme, netloc, path, params, query, fragment))

# Remove the title from the content, because the title will be put in the
# mail's subject. Leaving it here would produce the title twice.
h1s = body.findAll("h1")
if not h1s:
    print >> sys.stderr, "No <h1> element found, title will correspond to filename '%s'" % basen
    title = basen
else:
    # there should be only one element, but anyway only the first one is
    # used and removed
    heading = h1s[0]
    # Only keep the text content, not the inner elements themselves: joining
    # every text node keeps words wrapped in inline markup, and copes with a
    # heading holding no text node at all.
    quoted = u"".join(heading.findAll(text=True))
    if quoted.strip():
        # the title will be used as email subject, not as HTML content
        # anymore, so HTML entities need to be converted
        title = BeautifulStoneSoup(quoted, convertEntities=BeautifulStoneSoup.HTML_ENTITIES).contents[0]
    else:
        print >> sys.stderr, "Empty <h1> element found, title will correspond to filename '%s'" % basen
        title = basen
    heading.replaceWith("")

# We're done: write the three output files into the output directory.
# The title may be a unicode string (it comes out of the entity conversion),
# and writing non-ASCII unicode to a byte file fails on Python 2, so encode
# it first. UTF-8 is presumably also what renderContents() emits by default
# in BeautifulSoup 3 -- confirm if another output encoding is expected.
titledata = title
if isinstance(titledata, unicode):
    titledata = titledata.encode("utf-8")

outputs = (
    # filtered HTML: inner content of the body element
    (CONTENT_FILE, body.renderContents()),
    # page title
    (TITLE_FILE, titledata),
    # Drupal path: context URL followed by the file name without extension
    (PATH_FILE, JAPP_CONTEXT_URL + "/" + noext),
)
for outname, outdata in outputs:
    # "with" closes each file even when the write fails
    with open("%s/%s/%s" % (dirn, OUTPUT_DIR, outname), "w") as fout:
        fout.write(outdata)

Change log

r2181 by sebastien.lelong on Aug 11, 2010   Diff
JAPP: fix page title, converting HTML
entities as title is used as email
subject. Add some more step-by-step
documentation
Go to: 
Project members, sign in to write a code review

Older revisions

r1592 by sebastien.lelong on Jan 14, 2010   Diff
add context URL to prevent URL
collision when publishing content
r1491 by sebastien.lelong on Nov 21, 2009   Diff
fix: remove path parts when deeper
than one
r1448 by sebastien.lelong on Nov 1, 2009   Diff
dirty fix for relative image path
above current directory (../)
All revisions of this file

File info

Size: 5258 bytes, 149 lines
Powered by Google Project Hosting