#!/usr/bin/env python

# Make sure we can import Foundation and AppKit on Leopard, even on custom-installed Python.
import sys
# Locations of the system-supplied Python 2.5 framework, its bundled extras (including PyObjC), and the system-wide site-packages folder.
additional_paths = [
	'/System/Library/Frameworks/Python.framework/Versions/2.5/lib/python25.zip',
	'/System/Library/Frameworks/Python.framework/Versions/2.5/lib/python2.5',
	'/System/Library/Frameworks/Python.framework/Versions/2.5/lib/python2.5/plat-darwin',
	'/System/Library/Frameworks/Python.framework/Versions/2.5/lib/python2.5/plat-mac',
	'/System/Library/Frameworks/Python.framework/Versions/2.5/lib/python2.5/plat-mac/lib-scriptpackages',
	'/System/Library/Frameworks/Python.framework/Versions/2.5/Extras/lib/python',
	'/System/Library/Frameworks/Python.framework/Versions/2.5/lib/python2.5/lib-tk',
	'/System/Library/Frameworks/Python.framework/Versions/2.5/lib/python2.5/lib-dynload',
	'/Library/Python/2.5/site-packages',
	'/System/Library/Frameworks/Python.framework/Versions/2.5/Extras/lib/python/PyObjC',
]
# Append (rather than prepend) each location that is not already on the search path, so that entries already present keep their priority.
for path_to_add in additional_paths:
	if path_to_add in sys.path:
		continue
	sys.path.append(path_to_add)

import Foundation as F
import AppKit as AK

from itertools import imap as map
from itertools import izip as zip
from itertools import ifilter as filter
import os
import dircache
import string

# Python whines if you try to print unicode objects to stdout, so we need a function to convert such objects to UTF-8 to make it happy.
import codecs
# Look the UTF-8 encoder up once. Calling it returns a pair: the encoded byte string and the number of input characters consumed.
utf8_encode = codecs.getencoder('utf-8')
def unicode_to_UTF8(uni):
	"Return the given unicode object encoded as a UTF-8 byte string, suitable for printing to stdout."
	encoded, consumed = utf8_encode(uni)
	return encoded

# If any of these characters is in the string, we must put the string in quotes. The set covers the quote characters, space, backslash, the plist structural punctuation (including the ';' and ',' separators), and the ASCII control characters.
invalid_plist_characters = '\'" \\{}()<>/=;,' + ''.join(map(chr, range(0, 0x20)))
# Translation to convert special characters into backslash sequences: maps the ordinal of each special character to the letter that follows the backslash (for example, newline -> 'n'). The backslash itself is added in escape_as_plist.
specials_to_escape_sequences = dict(zip(map(ord, u'\a\b\t\n\v\f\r'), u'abtnvfr'))
def escape_as_plist(string):
	"""Return a string representing the input string and capable of being used as an OpenStep plist.

	repr would work, but also escapes Unicode characters into backslash-u sequences, which is bad for readability.

	string: the text to represent (normally a unicode object).

	Returns the input unchanged when it can be written without quotes. Otherwise returns it wrapped in double quotes, with backslashes, double quotes and control characters preceded by a backslash. The empty string is returned as a pair of double quotes, because an empty unquoted token is not a valid plist string.
	"""
	# Fast path: a non-empty string with none of the troublesome characters needs no quoting at all.
	if string and not any(ch in string for ch in invalid_plist_characters):
		return string

	quote_char = '"'
	characters_to_escape = '\\' + quote_char + ''.join(map(chr, range(0, 0x20)))

	# Build the result as a list of chunks. A chunk is either a run of consecutive characters that don't need to be escaped, or an escape sequence (backslash + one character).
	chunks = [quote_char]
	range_start = 0
	for i, ch in enumerate(string):
		if ch not in characters_to_escape:
			continue
		# Flush the run of ordinary characters that precedes this one.
		if i > range_start:
			chunks.append(string[range_start:i])
		# Make sure we use a letter for a special character: for example, '\\' + 'n' = \n. Characters without a letter form (backslash, quote, other control characters) follow the backslash as themselves. A dictionary lookup is used instead of translate() so that byte strings work as well as unicode strings.
		chunks.append('\\' + specials_to_escape_sequences.get(ord(ch), ch))
		range_start = i + 1
	# Flush whatever ordinary characters remain after the last escape.
	if range_start < len(string):
		chunks.append(string[range_start:])
	chunks.append(quote_char)

	return ''.join(chunks)

def warn_if_file_not_UTF16(pathname):
	"""Reads as much of the named file as necessary to determine whether its contents are UTF-16, and print a warning to sys.stderr if they aren't.

	pathname: path of the file to inspect.

	Only the first two bytes are read; the file is considered UTF-16 when they are a big-endian or little-endian byte-order mark. Errors from opening or reading the file propagate to the caller.
	"""
	# Open in binary mode, since we are comparing raw bytes, and close the handle explicitly rather than leaving it to the garbage collector. (try/finally rather than 'with', which Python 2.5 only supports via a __future__ import.)
	strings_file = open(pathname, 'rb')
	try:
		byte_order_mark = strings_file.read(2)
	finally:
		strings_file.close()

	if byte_order_mark not in ('\xfe\xff', '\xff\xfe', ):
		print >>sys.stderr, "!!! Warning: %s is not UTF-16 or is missing a byte-order mark" % (pathname,)

import optparse
# Command-line interface. Every option is optional; the positional arguments are the directories to search.
parser = optparse.OptionParser(
	usage='Usage: %prog [options] <directories-to-search>',
	description="Searches the given directory trees for *.strings files, and compares the primary language of each one (default: English) to each of the other languages, and outputs any strings that are the same (on the theory that nobody has translated them yet).",
)
# Both of the following options use action='append', so values given on the command line are added to the default lists rather than replacing them.
parser.add_option(
	'--ignore-directory', action='append', dest='ignore_directories', type='string',
	help="Don't descend into these directories. Initial set: 'build' + VCS directories (.svn, .hg, etc.). Using --ignore-directory adds to this set. The set always includes *.lproj (that is, it won't look for *.lproj folders inside *.lproj folders).",
	default=['build', '.svn', 'CVS', '.hg', '.bzr', '_darcs'],
)
parser.add_option(
	'--ignore-key', action='append', dest='ignore_keys', type='string',
	help="Don't report duplicates of these keys. Initial set: CFBundleName, CFBundleShortVersionString. Using --ignore-key adds to this set.",
	default=['CFBundleName', 'CFBundleShortVersionString'],
)
# British English localizations are named en_GB (GB is the ISO 3166 region code for the United Kingdom), so it must be listed or en_GB.lproj would be treated as a foreign language. en_UK is kept for backward compatibility.
default_primary_language = 'en:English:en_US:en_GB:en_UK:en_CA:en_AU'
parser.add_option(
	'--primary-language',
	help='The language to compare other languages to. Should be a colon-separated list of language names, including all regional variations and the old-style name. Defaults to %r.' % (default_primary_language,),
	default=default_primary_language,
)

opts, args = parser.parse_args()
if not args:
	# No directories given: search the current directory. This is a one-element list (not a bare string) so that iterating over args always yields whole arguments.
	args = ['.']

def gen_keys_with_matching_values(plist0, plist1):
	"""Generate the keys that both mappings contain and map to equal values.

	Keys are visited in plist0's iteration order. A key that is absent from plist1 is skipped silently.
	"""
	for key in plist0:
		try:
			value_pair = (plist0[key], plist1[key])
		except KeyError:
			# The key is not present in both mappings, so there is nothing to compare.
			continue
		if value_pair[0] == value_pair[1]:
			yield key

# Folder names that count as the primary language: each name from the colon-separated --primary-language option with '.lproj' appended.
english_names = [x + '.lproj' for x in opts.primary_language.split(':')]

# True after a "missing" or "could not read" message has been printed for a localized file and no blank line has been printed since. Used to decide when a separating blank line is owed.
could_not_read_last = False

# Each command-line argument is searched, and summarized, separately.
for cur_arg in args:
	parent = F.NSString.stringWithString_(cur_arg)
	# If the argument itself names an lproj folder, start the search from the folder that contains it.
	if parent.lastPathComponent().endswith('.lproj'):
		parent = parent.stringByDeletingLastPathComponent()

	# Number of problems found under this argument; reported after the walk finishes.
	num_problems = 0

	# Note that the loop rebinds the name 'parent'; os.walk has already been handed the starting path by then.
	for (parent, dirnames, filenames) in os.walk(parent):
		# Drop a leading './' so that the paths we build and print below don't carry it.
		if parent.startswith('./'):
			parent = parent[2:]

		# First, trim any directories that we don't want to descend into. Removing them from dirnames in place is what keeps os.walk out of them.
		for ignore in opts.ignore_directories:
			try:
				dirnames.remove(ignore)
			except ValueError:
				# This directory doesn't contain an entry by that name; nothing to trim.
				pass

		# If this isn't an lproj folder, look for lproj folders inside it.
		if os.path.splitext(parent)[1] != '.lproj':
			# Primary-language lproj folders found here, and subdirectories that aren't lproj folders at all.
			en_lproj = []
			not_lproj = []
			# As an optimization, we can skip the English localization altogether if there are no other localizations (i.e., if there's nothing to compare it to).
			any_other_lproj = False

			for lproj in list(dirnames):
				if os.path.splitext(lproj)[1] == '.lproj':
					if lproj in english_names:
						en_lproj.append(lproj)
					else:
						any_other_lproj = True
				else:
					not_lproj.append(lproj)
			# Replace the contents of dirnames in place, so that os.walk descends only into the primary-language lproj folders (and only if there is something to compare them to) plus the ordinary subdirectories. Other localizations are never walked directly; they are read when their primary-language sibling is visited.
			dirnames[:] = (en_lproj if any_other_lproj else []) + not_lproj
		else:
			# parent is an lproj folder (presumably, an English one). Compare its contents to any other localizations.

			# Go up one directory, in order to compare the English version to each other version. From here on, en_lproj is the name of this lproj folder (a string, unlike the list of the same name in the branch above).
			parent, en_lproj = os.path.split(parent)
			# Names of the sibling lproj folders that are not primary-language ones.
			lproj_names = []
			for dirname in dircache.listdir(parent):
				ext = os.path.splitext(dirname)[1]
				if ext == '.lproj':
					if dirname not in english_names:
						lproj_names.append(dirname)

			for filename in filenames:
				# We only work with strings files.
				# (pathname is assigned the same value again below, before it is first used.)
				pathname = os.path.join(parent, en_lproj, filename)
				if os.path.splitext(filename)[1] != '.strings':
					continue

				# The header is printed at most once per strings file, just before the first problem listed for it.
				has_printed_header = False
				header = '*** Found problems in ' + os.path.join(parent, '*.lproj', filename)

				pathname = os.path.join(parent, en_lproj, filename)
				# Ignore strings files that don't exist in the primary language.
				if not os.path.exists(pathname):
					continue
				en_plist = F.NSDictionary.dictionaryWithContentsOfFile_(pathname)
				# A result of None is treated as "the file could not be read", counted as one problem.
				if en_plist is None:
					print >>sys.stderr, '*** Could not read primary-language plist file at path %s' % (pathname)
					# Blank separator line (this one goes to stdout, whereas the message above goes to stderr).
					print
					num_problems += 1
					continue

				warn_if_file_not_UTF16(pathname)

				# Compare the primary-language file against the file of the same name in each other localization.
				for lproj in lproj_names:
					pathname = os.path.join(parent, lproj, filename)
					# A strings file that doesn't exist in this language is reported as a problem and skipped. Note that this warning is printed to stdout, unlike the other warnings, which go to stderr.
					if not os.path.exists(pathname):
						print '!!! Warning: Localized file %s is missing' % (pathname,)
						could_not_read_last = True
						num_problems += 1
						continue
					plist = F.NSDictionary.dictionaryWithContentsOfFile_(pathname)
					# As above, None is treated as "could not be read" and counted as one problem.
					if plist is None:
						print >>sys.stderr, '*** Could not read plist file at path %s' % (pathname)
						could_not_read_last = True
						num_problems += 1
						continue

					warn_if_file_not_UTF16(pathname)

					# Keys whose value is the same in both languages, leaving out the keys the user asked us to ignore.
					keys_with_matching_values = [k for k in gen_keys_with_matching_values(en_plist, plist) if k not in opts.ignore_keys]

					# If we have at least one key with the same value, print the header for listing them.
					if keys_with_matching_values:
						if could_not_read_last:
							# We could read this one, but we should print a blank line after the last "Could not read plist file" message.
							print
							could_not_read_last = False

						if not has_printed_header:
							print header
							has_printed_header = True

						print 'Duplicate strings in %s between %s and %s:' % (filename, en_lproj, lproj)
						for k in keys_with_matching_values:
							# Ignored keys were already filtered out when the list was built, so this check is redundant.
							if k not in opts.ignore_keys:
								# Print the pair in strings-file syntax (key = value;), encoded as UTF-8. Each duplicate counts as one problem.
								print unicode_to_UTF8(escape_as_plist(k)), '=', unicode_to_UTF8(escape_as_plist(en_plist[k])) + ';'
								num_problems += 1
						print

					primary_keys = set(en_plist)
					other_keys = set(plist)

					# Keys that the primary language defines but this localization lacks. Each one counts as one problem.
					missing_keys = primary_keys.difference(other_keys)
					if missing_keys:
						if not has_printed_header:
							print header
							has_printed_header = True

						print 'Missing strings in %s (found in %s):' % (os.path.join(lproj, filename), os.path.join(en_lproj, filename))
						for k in missing_keys:
							print unicode_to_UTF8(escape_as_plist(k))
							num_problems += 1

						print

				if could_not_read_last:
					# We couldn't read the last translation of this file. We should print a blank line after the last "Could not read plist file" message (i.e., between this file and the next).
					print
					could_not_read_last = False

	# When the argument is '.', show the actual name of the current directory in the summary instead.
	if cur_arg == '.':
		cur_arg = os.path.basename(os.path.realpath(os.curdir))
	print 'Found %u problems in %s' % (num_problems, cur_arg)
