pygettext.py
#! /usr/bin/env python
# Originally written by Barry Warsaw <bwarsaw@python.org>

"""pygettext -- Python equivalent of xgettext(1)

Many systems (Solaris, Linux, GNU) provide extensive tools that ease the
internationalization of C programs.  Most of these tools are independent
of the programming language and can be used from within Python programs.
Martin von Loewis' work[1] helps considerably in this regard.

There's one problem though; xgettext is the program that scans source code
looking for message strings, but it groks only C (or C++).  Python
introduces a few wrinkles, such as dual quoting characters, triple quoted
strings, and raw strings.  xgettext understands none of this.

Enter pygettext, which uses Python's standard tokenize module to scan
Python source code, generating .pot files identical to what GNU
xgettext[2] generates for C and C++ code.  From there, the standard GNU
tools can be used.

A word about marking Python strings as candidates for translation.  GNU
xgettext recognizes the following keywords: gettext, dgettext, dcgettext,
and gettext_noop.  But those can be a lot of text to include all over your
code.  C and C++ have a trick: they use the C preprocessor.  Most
internationalized C source includes a #define for gettext() to _() so that
what has to be written in the source is much less.  Thus these are both
translatable strings:

    gettext("Translatable String")
    _("Translatable String")

Python of course has no preprocessor so this doesn't work as well.  Thus,
pygettext searches only for _() by default, but see the -k/--keyword flag
below for how to augment this.

[1] http://www.python.org/workshops/1997-10/proceedings/loewis.html
[2] http://www.gnu.org/software/gettext/gettext.html

NOTE: pygettext attempts to be option and feature compatible with GNU
xgettext wherever possible.

Usage: pygettext [options] filename ...

Options:

    -a
    --extract-all
        Extract all strings.

    -d default-domain
    --default-domain=default-domain
        Rename the default output file from messages.pot to
        default-domain.pot.

    -k [word]
    --keyword[=word]
        Additional keywords to look for.  A bare --keyword without `word'
        disables the default keywords.  The default keywords, which are
        always looked for if not explicitly disabled: _
        The default keyword list differs from GNU xgettext's.  You can
        give multiple -k flags on the command line.

    --no-location
        Do not write filename/lineno location comments.

    -n [style]
    --add-location[=style]
        Write filename/lineno location comments indicating where each
        extracted string is found in the source.  These lines appear
        before each msgid.  Two styles are supported:

            Solaris  # File: filename, line: line-number
            GNU      #: filename:line

        If style is omitted, GNU is used.  The style name is case
        insensitive.  By default, locations are included.

    -v
    --verbose
        Print the names of the files being processed.

    -h
    --help
        Print this help message and exit.
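Example (an illustrative invocation; `myapp' and its files are just
placeholders):

    pygettext.py -d myapp -v myapp/*.py

This scans each named file and writes all extracted msgids to myapp.pot.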
- """
- import os
- import sys
- import string
- import time
- import getopt
- import tokenize
- __version__ = '0.2-1'
- # for selftesting
- def _(s): return s
# The normal pot-file header.  msgmerge and Emacs' po-mode work better if
# it's there.
pot_header = _('''\
# SOME DESCRIPTIVE TITLE.
# Copyright (C) YEAR ORGANIZATION
# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
#
msgid ""
msgstr ""
"Project-Id-Version: PACKAGE VERSION\\n"
"PO-Revision-Date: %(time)s\\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
"Language-Team: LANGUAGE <LL@li.org>\\n"
"MIME-Version: 1.0\\n"
"Content-Type: text/plain; charset=CHARSET\\n"
"Content-Transfer-Encoding: ENCODING\\n"
"Generated-By: pygettext.py %(version)s\\n"
''')
def usage(code, msg=''):
    print __doc__ % globals()
    if msg:
        print msg
    sys.exit(code)
# Translation table from raw characters to their .po-file escape form:
# printable ASCII passes through, everything else becomes an octal escape.
escapes = []
for i in range(256):
    if i < 32 or i > 127:
        escapes.append("\\%03o" % i)
    else:
        escapes.append(chr(i))

escapes[ord('\\')] = '\\\\'
escapes[ord('\t')] = '\\t'
escapes[ord('\r')] = '\\r'
escapes[ord('\n')] = '\\n'
escapes[ord('"')] = '\\"'
def escape(s):
    s = list(s)
    for i in range(len(s)):
        s[i] = escapes[ord(s[i])]
    return string.join(s, '')
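# For instance (illustrative values, not part of the original file):
#     escape('Tab\there')  ->  'Tab\\there'  (a real TAB becomes the two
#                                             characters backslash and t)
#     escape('caf\xe9')    ->  'caf\\351'    (non-ASCII bytes become octal
#                                             escapes; 0xE9 is octal 351)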
def safe_eval(s):
    # unwrap quotes, safely
    return eval(s, {'__builtins__': {}}, {})
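# safe_eval turns a raw string token from tokenize back into its value,
# e.g. the token r'"Hello\tWorld"' becomes the string with a real tab in
# it.  The empty __builtins__ mapping keeps the eval from reaching any
# built-in names.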
def normalize(s):
    # This converts the various Python string types into a format that is
    # appropriate for .po files, namely much closer to C style.
    lines = string.split(s, '\n')
    if len(lines) == 1:
        s = '"' + escape(s) + '"'
    else:
        if not lines[-1]:
            del lines[-1]
            lines[-1] = lines[-1] + '\n'
        for i in range(len(lines)):
            lines[i] = escape(lines[i])
        s = '""\n"' + string.join(lines, '\\n"\n"') + '"'
    return s
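# A multi-line message is rendered as an empty first line followed by one
# quoted source line per physical line.  For example,
# normalize('Hello\nWorld\n') produces:
#     ""
#     "Hello\n"
#     "World\n"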
try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO
from string import find, split
from sgmllib import SGMLParser
class I18NParser(SGMLParser):
    gettext_tags = ["dtml-gettext", "dtml-gt"]

    def __init__(self):
        SGMLParser.__init__(self)
        self.data = ""
        self.messages = []
        self.startpos = 0

    def unknown_starttag(self, tag, attrs):
        for k, v in attrs:
            for t in self.gettext_tags:
                # find() returns -1 when the tag is absent, so compare
                # explicitly rather than relying on truthiness.
                if find(v, t) >= 0:
                    p = I18NParser()
                    p.feed(v)
                    if p.messages:
                        self.messages.extend(p.messages)
                    break
        if tag in self.gettext_tags:
            self.data = ""

    def unknown_endtag(self, tag):
        if tag in self.gettext_tags:
            f = find(self.rawdata, self.data, self.startpos)
            row = len(split(self.rawdata[:f], "\n"))
            self.messages.append((self.data, row))
            self.startpos = f + 1
            self.data = ""

    def handle_entityref(self, name): self.handle_data("&%s;" % name)
    def handle_charref(self, name): self.handle_data("&#%s;" % name)
    def handle_comment(self, data): self.handle_data("<!--%s-->" % data)
    def handle_data(self, data): self.data = self.data + data
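# Feeding the parser '<p>\n<dtml-gettext>Hello</dtml-gettext></p>' leaves
# messages == [('Hello', 2)]: the text of each dtml-gettext element, plus
# the source line on which that text starts.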
def htmlfilter(fp):
    buf, row = "", 1
    p = I18NParser()
    p.feed(fp.read())
    for msg, lineno in p.messages:
        while row < lineno:
            buf = buf + "\n"
            row = row + 1
        buf = buf + '_("%s"); ' % msg
    return StringIO(buf)
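# htmlfilter rewrites an HTML/DTML file as pseudo-Python so the tokenize
# pass below can process it: each extracted message becomes a _("...")
# call, padded with newlines so it lands on its original line number.  The
# example above would come out as '\n_("Hello"); '.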
class TokenEater:
    def __init__(self, options):
        self.__options = options
        self.__messages = {}
        self.__state = self.__waiting
        self.__data = []
        self.__lineno = -1

    def __call__(self, ttype, tstring, stup, etup, line):
        # dispatch
        self.__state(ttype, tstring, stup[0])

    def __waiting(self, ttype, tstring, lineno):
        if ttype == tokenize.NAME and tstring in self.__options.keywords:
            self.__state = self.__keywordseen

    def __keywordseen(self, ttype, tstring, lineno):
        if ttype == tokenize.OP and tstring == '(':
            self.__data = []
            self.__lineno = lineno
            self.__state = self.__openseen
        else:
            self.__state = self.__waiting

    def __openseen(self, ttype, tstring, lineno):
        if ttype == tokenize.OP and tstring == ')':
            # We've seen the last of the translatable strings.  Record the
            # line number of the first line of the strings and update the
            # list of messages seen.  Reset state for the next batch.  If
            # there were no strings inside _(), then just ignore this entry.
            if self.__data:
                msg = string.join(self.__data, '')
                entry = (self.__curfile, self.__lineno)
                linenos = self.__messages.get(msg)
                if linenos is None:
                    self.__messages[msg] = [entry]
                else:
                    linenos.append(entry)
            self.__state = self.__waiting
        elif ttype == tokenize.STRING:
            self.__data.append(safe_eval(tstring))
        # TBD: should we warn if we see anything else?
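    # State walk-through for the source text _("Hello"), as tokenize feeds
    # it to __call__:
    #     NAME '_'           __waiting matches a keyword   -> __keywordseen
    #     OP '('             start collecting strings      -> __openseen
    #     STRING '"Hello"'   safe_eval'd and buffered
    #     OP ')'             ('Hello', (file, line)) recorded, back to
    #                        __waiting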
    def set_filename(self, filename):
        self.__curfile = filename

    def write(self, fp):
        options = self.__options
        timestamp = time.ctime(time.time())
        # common header
        try:
            sys.stdout = fp
            # The time stamp in the header doesn't have the same format
            # as that generated by xgettext...
            print pot_header % {'time': timestamp, 'version': __version__}
            for k, v in self.__messages.items():
                for filename, lineno in v:
                    # location comments differ between Solaris and GNU
                    d = {'filename': filename,
                         'lineno': lineno}
                    if options.location == options.SOLARIS:
                        print _('# File: %(filename)s, line: %(lineno)d') % d
                    elif options.location == options.GNU:
                        print _('#: %(filename)s:%(lineno)d') % d
                # TBD: sorting, normalizing
                print 'msgid', normalize(k)
                print 'msgstr ""'
                print
        finally:
            sys.stdout = sys.__stdout__
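# With the default GNU location style, a string _("Hello") found at line 12
# of app.py is emitted as:
#     #: app.py:12
#     msgid "Hello"
#     msgstr ""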
def main():
    default_keywords = ['_']
    try:
        opts, args = getopt.getopt(
            sys.argv[1:],
            'k:d:n:hv',
            ['keyword', 'default-domain', 'help',
             'add-location=', 'no-location', 'verbose'])
    except getopt.error, msg:
        usage(1, msg)

    # for holding option values
    class Options:
        # constants
        GNU = 1
        SOLARIS = 2
        # defaults
        keywords = []
        outfile = 'messages.pot'
        location = GNU
        verbose = 0

    options = Options()
    locations = {'gnu': options.GNU,
                 'solaris': options.SOLARIS,
                 }
    # parse options
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-k', '--keyword'):
            # a bare --keyword (getopt hands us an empty arg) turns off
            # the default keyword list; otherwise add the given keyword
            if not arg:
                default_keywords = []
            else:
                options.keywords.append(arg)
        elif opt in ('-d', '--default-domain'):
            options.outfile = arg + '.pot'
        elif opt in ('-n', '--add-location'):
            if not arg:
                arg = 'gnu'
            try:
                options.location = locations[string.lower(arg)]
            except KeyError:
                d = {'arg': arg}
                usage(1, _('Invalid value for --add-location: %(arg)s') % d)
        elif opt in ('--no-location',):
            options.location = 0
        elif opt in ('-v', '--verbose'):
            options.verbose = 1

    # calculate all keywords
    options.keywords.extend(default_keywords)
    # slurp through all the files
    eater = TokenEater(options)
    for filename in args:
        if options.verbose:
            print _('Working on %(filename)s') % {'filename': filename}
        ext = os.path.splitext(filename)[1]
        if ext in [".htm", ".html"]:
            fp = htmlfilter(open(filename))
        else:
            fp = open(filename)
        eater.set_filename(filename)
        tokenize.tokenize(fp.readline, eater)
        fp.close()

    fp = open(options.outfile, 'w')
    eater.write(fp)
    fp.close()


if __name__ == '__main__':
    main()