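Description: Fix insecure use of temporary files in the Cuneiform backend.
 recognize() used to pass Cuneiform the name of a NamedTemporaryFile as its
 hOCR output path. Cuneiform can create additional files (e.g. images)
 alongside its output, so the output is now written to ocr.html inside a
 private directory created with tempfile.mkdtemp(), and that directory is
 removed with shutil.rmtree() once the result has been read.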
Origin: backport, http://bitbucket.org/jwilk/ocrodjvu/changeset/dedea346339f
Bug-Debian: http://bugs.debian.org/598134
Last-Update: 2010-09-26
--- a/lib/cuneiform.py
+++ b/lib/cuneiform.py
@@ -11,8 +11,12 @@
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
+from __future__ import with_statement
+
import contextlib
import re
+import os
+import shutil
import tempfile
from cStringIO import StringIO
@@ -67,17 +71,24 @@
return language in get_languages()
def recognize(pbm_file, language):
- hocr_file = tempfile.NamedTemporaryFile(prefix='ocrodjvu.', suffix='.html')
- worker = ipc.Subprocess(
- ['cuneiform', '-l', iso_to_cuneiform(language), '-f', 'hocr', '-o', hocr_file.name, pbm_file.name],
- stdout=ipc.PIPE,
- env={}, # locale=POSIX
- )
- worker.wait()
+ hocr_directory = tempfile.mkdtemp(prefix='ocrodjvu.')
+ # A separate non-world-writable directory is needed, as Cuneiform
+ # can create additional files, e.g. images.
+ try:
+ hocr_file_name = os.path.join(hocr_directory, 'ocr.html')
+ worker = ipc.Subprocess(
+ ['cuneiform', '-l', iso_to_cuneiform(language), '-f', 'hocr', '-o', hocr_file_name, pbm_file.name],
+ stdout=ipc.PIPE,
+ env={}, # locale=POSIX
+ )
+ worker.wait()
+ with open(hocr_file_name, 'r') as hocr_file:
+ contents = hocr_file.read()
+ finally:
+ shutil.rmtree(hocr_directory)
# Sometimes Cuneiform returns files with broken encoding or with control
# characters: https://bugs.launchpad.net/cuneiform-linux/+bug/585418
# Let's fix it.
- contents = hocr_file.read()
contents = utils.sanitize_utf8(contents)
return contextlib.closing(StringIO(contents))