forked from SavinaRoja/OpenAccess_EPUB
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.py
More file actions
255 lines (223 loc) · 9.85 KB
/
Copy pathutils.py
File metadata and controls
255 lines (223 loc) · 9.85 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
'''utility/common stuff'''
import os.path
import zipfile
from collections import namedtuple
# Simple record with ``id`` and ``type`` fields. The typename passed to
# namedtuple matches the variable name so that repr() shows 'Identifier'.
Identifier = namedtuple('Identifier', 'id, type')
def makeEPUBBase(location, css_location):
    '''Build a minimal ePub directory skeleton rooted at ``location``.

    Typical practice will not require this function; use it to recreate the
    default base ePub directory when it is not present. It can also be read
    as a primer on ePub directory construction. The following is created:

    base_epub/
    base_epub/mimetype
    base_epub/META-INF/
    base_epub/META-INF/container.xml
    base_epub/OPS/
    base_epub/OPS/css
    base_epub/OPS/css/article.css

    The image directories (OPS/images/ and anything below it) are not
    created here: they are optional, depend on content, and are left to
    other code.

    location -- path of the root directory to create; os.mkdir raises if it
                already exists
    css_location -- path of the stylesheet whose text is copied to
                    OPS/css/article.css
    '''
    #Root directory first, everything else lives under it
    os.mkdir(location)
    #The mimetype file sits directly in the root directory
    with open(os.path.join(location, 'mimetype'), 'w') as mime_file:
        mime_file.write('application/epub+zip')
    #META-INF and OPS directories
    meta_inf_dir = os.path.join(location, 'META-INF')
    ops_dir = os.path.join(location, 'OPS')
    for directory in (meta_inf_dir, ops_dir):
        os.mkdir(directory)
    #container.xml points at the package document, OPS/content.opf
    with open(os.path.join(meta_inf_dir, 'container.xml'), 'w') as container:
        container.write('''<?xml version="1.0" encoding="UTF-8" ?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
<rootfiles>
<rootfile full-path="OPS/content.opf" media-type="application/oebps-package+xml"/>
</rootfiles>
</container>''')
    #css directory in OPS, then copy the stylesheet text into it
    css_dir = os.path.join(ops_dir, 'css')
    os.mkdir(css_dir)
    with open(os.path.join(css_dir, 'article.css'), 'w') as css_out:
        with open(css_location, 'r') as css_src:
            css_out.write(css_src.read())
def createDCElement(document, name, data, attributes = None):
    '''A convenience function for creating DC tag elements.
    Used in content.opf

    document -- DOM Document used as the factory for the new nodes
    name -- tag name of the element to create
    data -- text placed in a single text node child of the element
    attributes -- optional mapping of attribute name to attribute value

    Returns the new element; it is not attached to the document tree.
    '''
    newnode = document.createElement(name)
    newnode.appendChild(document.createTextNode(data))
    if attributes:
        #items() is available on both Python 2 and Python 3 dictionaries,
        #whereas iteritems() only exists on Python 2
        for attr, attrval in attributes.items():
            newnode.setAttribute(attr, attrval)
    return newnode
def stripDOMLayer(oldnodelist, depth = 1):
    '''Peel ``depth`` layers off the top of a Node or a NodeList.

    For each layer, the children of every node in the current list are
    gathered into one flat list, so nodes that had different parents end up
    side by side as siblings. To be used with caution.

    oldnodelist -- a single Node, or an iterable of Nodes
    depth -- number of layers to remove; a falsy value returns the input
             object unchanged

    Returns the nodes found ``depth`` levels below the input.
    '''
    if not depth:
        return oldnodelist
    children = []
    try:
        for node in oldnodelist:
            children += node.childNodes
    except TypeError:
        #Iteration raised TypeError, which is what happens when a single
        #Node is passed in; use that Node's own children as the next layer
        children = oldnodelist.childNodes
    return stripDOMLayer(children, depth - 1)
def serializeText(fromnode, stringlist = None, sep = u''):
    '''Recursively extract the text data from a node and its children.

    fromnode -- node whose childNodes are walked depth-first
    stringlist -- optional list that collected strings are appended to; it
                  is modified in place. When omitted, a fresh list is used
                  for each top-level call.
    sep -- separator placed between the collected strings

    Returns the collected strings joined with ``sep``. Text nodes whose
    data is exactly a single newline are not collected.
    '''
    #A list literal as the default would be created once and shared by every
    #call, so text from earlier calls would leak into later results
    if stringlist is None:
        stringlist = []
    for item in fromnode.childNodes:
        if item.nodeType == item.TEXT_NODE and not item.data == u'\n':
            stringlist.append(item.data)
        else:
            serializeText(item, stringlist, sep)
    return sep.join(stringlist)
def getTagText(node):
    '''Grab the text data from a Node. If it is provided a NodeList, it will
    return the text data from the first contained Node.

    Only direct text children are considered, and text nodes whose data is
    exactly a single newline are skipped. If the Node has several qualifying
    text children, the data of the last one is returned. An empty unicode
    string is returned when there is no qualifying text child.

    Raises IndexError if given an empty NodeList.
    '''
    try:
        children = node.childNodes
    except AttributeError:
        #No childNodes attribute: treat the argument as a NodeList and hand
        #back the result for its first Node (previously this was discarded)
        return getTagText(node[0])
    data = u''
    for child in children:
        if child.nodeType == child.TEXT_NODE and not child.data == u'\n':
            data = child.data
    return data
def getFormattedNode(node):
    '''This function is called on a Node whose children may include emphasis
    elements. The contained emphasis elements will be converted to ePub-safe
    emphasis elements. Non-emphasis elements will be untouched.

    bold and italic become b and i; monospace, overline, sc, strike and
    underline become span elements carrying an equivalent inline CSS style.

    Returns a deep copy of ``node`` with the conversions applied; the node
    passed in is not modified.
    '''
    emphasis_elements = [u'bold', u'italic', u'monospace', u'overline',
                         u'sc', u'strike', u'underline']
    #Elements with a direct replacement tag
    renames = {u'bold': u'b', u'italic': u'i'}
    #Elements supported through CSS on a span
    spans = {u'monospace': u'font-family:monospace',
             u'overline': u'text-decoration:overline',
             u'sc': u'font-variant:small-caps',
             u'strike': u'text-decoration:line-through',
             u'underline': u'text-decoration:underline'}
    clone = node.cloneNode(deep = True)
    for element in emphasis_elements:
        for item in clone.getElementsByTagName(element):
            #Look the style up by tag name; the Element object itself is
            #never a key of the spans dictionary
            if element in renames:
                item.tagName = renames[element]
            elif element in spans:
                item.tagName = u'span'
                item.setAttribute('style', spans[element])
    return clone
def getTagData(node_list):
    '''Grab the (string) data from text elements
    node_list -- NodeList returned by getElementsByTagName; a single Node
                 is also accepted and is treated as a one-item list

    For each node, the data of its first child is taken when that child is
    a text node. If several nodes qualify, the data of the last one is
    returned. Nodes without children are skipped. Returns an empty unicode
    string when nothing qualifies.
    '''
    try:
        nodes = iter(node_list)
    except TypeError:
        #A lone Node is not iterable; wrap it so the same loop handles it
        #(previously the result for this case was computed but not returned)
        nodes = iter([node_list])
    data = u''
    for node in nodes:
        first = node.firstChild
        #firstChild is None for an element with no children
        if first is not None and first.nodeType == node.TEXT_NODE:
            data = first.data
    return data
def epubZip(outdirect):
    '''Zips up the input file directory into an ePub file.

    outdirect -- path of the directory holding the ePub contents; it must
                 contain a mimetype file and may contain META-INF and OPS
                 directories

    The archive is written next to the directory as ``outdirect + '.epub'``.
    The mimetype entry is added before anything else, followed by the
    contents of META-INF and then OPS.

    The process working directory is changed to ``outdirect`` while the
    archive is filled, so that entries are stored with paths relative to
    it. It is restored afterwards, and the archive is closed, even if
    adding an entry fails.
    '''
    epub_filename = outdirect + '.epub'
    epub = zipfile.ZipFile(epub_filename, 'w')
    try:
        current_dir = os.getcwd()
        os.chdir(outdirect)
        try:
            epub.write('mimetype')
            recursive_zip(epub, 'META-INF')
            recursive_zip(epub, 'OPS')
        finally:
            #Never leave the caller stranded in the ePub directory
            os.chdir(current_dir)
    finally:
        epub.close()
def recursive_zip(zipf, directory, folder = ""):
    '''Add every file found under ``directory`` to an open zipfile.

    zipf -- a writable zipfile.ZipFile
    directory -- directory to walk; each file is stored in the archive under
                 the same path that was used to reach it from here
    folder -- accumulated while recursing into subdirectories; it does not
              influence the names stored in the archive

    Entries that are neither regular files nor directories are ignored.
    '''
    for entry in os.listdir(directory):
        entry_path = os.path.join(directory, entry)
        if os.path.isfile(entry_path):
            zipf.write(entry_path, entry_path)
        elif os.path.isdir(entry_path):
            recursive_zip(zipf, entry_path, os.path.join(folder, entry))
def suggestedArticleTypes():
    '''Returns a list of suggested values for article-type.

    A new list is built on every call, so callers may modify the result.
    Each value appears exactly once.
    '''
    #See http://dtd.nlm.nih.gov/publishing/tag-library/3.0/n-w2d0.html
    s = ['abstract', 'addendum', 'announcement', 'article-commentary',
         'book-review', 'books-received', 'brief-report', 'calendar',
         'case-report', 'collection', 'correction', 'discussion',
         'dissertation', 'editorial', 'in-brief', 'introduction', 'letter',
         'meeting-report', 'news', 'obituary', 'oration',
         'partial-retraction', 'product-review', 'rapid-communication',
         'reply', 'reprint', 'research-article', 'retraction',
         'review-article', 'translation']
    return s
def initiateDocument(titlestring,
                     _publicId = '-//W3C//DTD XHTML 1.1//EN',
                     _systemId = 'http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd'):
    '''Create a new xml.dom.minidom Document with a head and an empty body.

    titlestring -- text placed inside the title element of the head
    _publicId -- public identifier used for the document type
    _systemId -- system identifier used for the document type

    The head holds, in order, the title, a stylesheet link to
    css/reference.css and a Content-Type meta element.

    Returns a (document, body element) tuple.
    '''
    from xml.dom.minidom import getDOMImplementation
    dom_impl = getDOMImplementation()
    #NOTE(review): the doctype is named 'article' while the document element
    #is named 'root'; confirm that this pairing is intended
    doctype = dom_impl.createDocumentType('article', _publicId, _systemId)
    doc = dom_impl.createDocument(None, 'root', doctype)
    root = doc.documentElement
    root.setAttribute('xmlns', 'http://www.w3.org/1999/xhtml')
    root.setAttribute('xml:lang', 'en-US')

    #Assemble the head children before attaching anything to the root
    title = doc.createElement('title')
    title.appendChild(doc.createTextNode(titlestring))

    link = doc.createElement('link')
    for attr_name, attr_value in (('rel', 'stylesheet'),
                                  ('href', 'css/reference.css'),
                                  ('type', 'text/css')):
        link.setAttribute(attr_name, attr_value)

    meta = doc.createElement('meta')
    for attr_name, attr_value in (('http-equiv', 'Content-Type'),
                                  ('content', 'application/xhtml+xml'),
                                  ('charset', 'utf-8')):
        meta.setAttribute(attr_name, attr_value)

    head = doc.createElement('head')
    for tag in (title, link, meta):
        head.appendChild(tag)

    body = doc.createElement('body')
    root.appendChild(head)
    root.appendChild(body)
    return doc, body
def scrapePLoSIssueCollection(issue_url):
    '''Uses Beautiful Soup to scrape the PLoS page of an issue. It is used
    instead of xml.dom.minidom because of malformed html/xml.

    issue_url -- URL of the issue's table of contents page; it is expected
                 to look like http://www.<journal>.org/... where <journal>
                 is one of the keys of the journal map below

    The page is downloaded, passed through a scratch file named 'temp' in
    the current working directory, and parsed. One 'doi:10.1371/...' line
    per article link is then written to
    collections/<journal name>_<issue date>.txt

    NOTE(review): the 'collections' directory is not created here, so it
    presumably has to exist already; verify against the caller.

    Raises KeyError if the journal part of the URL is not in the journal
    map, and IndexError if the URL or the page heading do not have the
    expected form.
    '''
    from BeautifulSoup import BeautifulStoneSoup
    import urllib2
    import os
    import os.path
    iu = urllib2.urlopen(issue_url)
    try:
        page = iu.read()
    finally:
        #Release the connection whether or not the read succeeded
        iu.close()
    try:
        with open('temp', 'w') as temp:
            temp.write(page)
        with open('temp', 'r') as temp:
            soup = BeautifulStoneSoup(temp)
    finally:
        #Do not leave the scratch file behind if writing or parsing fails
        if os.path.exists('temp'):
            os.remove('temp')
    #Map the journal urls to nice strings
    jrns = {'plosgenetics': 'PLoS_Genetics', 'plosone' :'PLoS_ONE',
            'plosntds': 'PLoS_Neglected_Tropical_Diseases', 'plosmedicine':
            'PLoS_Medicine', 'plosbiology': 'PLoS_Biology', 'ploscompbiol':
            'PLoS_Computational_Biology', 'plospathogens': 'PLoS_Pathogens'}
    #The first h1 reads 'Table of Contents | <issue date>'
    toc = soup.find('h1').string
    date = toc.split('Table of Contents | ')[1].replace(' ', '_')
    key = issue_url.split('http://www.')[1].split('.org')[0]
    name = '{0}_{1}.txt'.format(jrns[key], date)
    collection_name = os.path.join('collections', name)
    with open(collection_name, 'w') as collection:
        links = soup.findAll('a', attrs={'title': 'Read Open Access Article'})
        for link in links:
            href = link['href']
            if href.startswith('/article/'):
                #The DOI suffix sits between the url-encoded '10.1371/'
                #prefix and the first ';' of the href
                article_id = href.split('10.1371%2F')[1].split(';')[0]
                collection.write('doi:10.1371/{0}\n'.format(article_id))