mirror of
https://github.com/GerbilSoft/rom-properties.git
synced 2025-06-18 11:35:38 -04:00
[libromdata] Xbox360_STFS: Fix titles for some packages that were authored incorrectly and have mojibake titles.
Two types of mojibake are detected and fixed: - UTF-8 parsed as cp1252 - Shift-JIS parsed as cp1252 For the latter case, utf16_to_cp1252() on Linux (and other systems that use iconv) has been modified to handle five code points that iconv() doesn't support: 0x81, 0x8D, 0x8F, 0x90, 0x9D. TODO: Also do this for utf8_to_cp1252() and other cp1252 functions? On Windows, WideCharToMultiByte() and MultiByteToWideChar() handle these code points without any issues. These three title updates now show a correct title: TU_11LK1UV_0000004000000.0000000000081 (UTF-8 as cp1252) Name: 'Dodonpachi Daifukkatsu Black Label Title Update #1' Title: '怒首領蜂 大復活 Black Label' TU_12501VG_0000004000000.0000000000101 (UTF-8 as cp1252) Name: 'DREAM C CLUB Title Update #4' Title: 'ドリームクラブ' TU_15LG1UH_000000C000000.0000000000083 (Shift-JIS as cp1252) Name: 'Circle of Students Title Update #1' Title: '円卓の生徒' Fixes #450: X360 - Non-Latin Titles appearing as mojibake Reported by @Masamune3210.
This commit is contained in:
parent
34f76050cb
commit
5de651dbbe
8
NEWS.md
8
NEWS.md
@ -2,11 +2,19 @@
|
||||
|
||||
## v2.6 (released 2025/??/??)
|
||||
|
||||
* New parser features:
|
||||
* Xbox360_STFS: Fix titles for some packages that were authored incorrectly
|
||||
and have mojibake titles. This includes cases where UTF-8 was parsed as
|
||||
cp1252, and where Shift-JIS was parsed as cp1252.
|
||||
* Fixes #450: X360 - Non-Latin Titles appearing as mojibake
|
||||
* Reported by @Masamune3210.
|
||||
|
||||
* Bug fixes:
|
||||
* Windows: Work around a potential libpng crash when attempting to read
|
||||
empty data as a PNG image. (Needs more debugging for a proper fix...)
|
||||
* See #451: libpng errors crash due to libpng setjmp/longjmp (Windows 10, release builds only)
|
||||
* Reported by @Masamune3210.
|
||||
|
||||
* Other changes:
|
||||
* rpcli: Added more colorization for warning messages.
|
||||
* rpcli: Refactored console handling into a separate library, libgsvt.
|
||||
|
@ -31,6 +31,7 @@ using namespace LibRpTexture;
|
||||
using std::array;
|
||||
using std::shared_ptr;
|
||||
using std::string;
|
||||
using std::u16string;
|
||||
using std::unique_ptr;
|
||||
using std::vector;
|
||||
|
||||
@ -151,6 +152,18 @@ public:
|
||||
* @return Default executable on success; nullptr on error.
|
||||
*/
|
||||
Xbox360_XEX *openDefaultXex(void);
|
||||
|
||||
public:
|
||||
/**
|
||||
* Get the title.
|
||||
*
|
||||
* Encoded as UTF-16BE, but some titles were incorrectly converted
|
||||
* from cp1252 when they should have been converted from UTF-8 or Shift-JIS,
|
||||
* so this function has a heuristic to detect and fix this.
|
||||
*
|
||||
* @return Title
|
||||
*/
|
||||
string getTitle(void) const;
|
||||
};
|
||||
|
||||
ROMDATA_IMPL(Xbox360_STFS)
|
||||
@ -560,6 +573,66 @@ Xbox360_XEX *Xbox360_STFS_Private::openDefaultXex(void)
|
||||
return this->xex.get();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the title.
|
||||
*
|
||||
* The text was incorrectly converted from cp1252 to UTF-16BE, when the original
|
||||
* text was actually encoded as UTF-8. This function uses a heuristic to detect
|
||||
* this incorrect conversion and fix it.
|
||||
*
|
||||
* @return Title
|
||||
*/
|
||||
string Xbox360_STFS_Private::getTitle(void) const
|
||||
{
|
||||
// Check for common mojibake characters.
|
||||
static constexpr array<char16_t, 5> mojibake_chars = {{
|
||||
0x0192, // 'ƒ': LATIN SMALL LETTER F WITH HOOK
|
||||
0x00E6, // 'æ': LATIN SMALL LETTER AE
|
||||
0x20AC, // '€': EURO SIGN
|
||||
0x2020, // '†': DAGGER
|
||||
}};
|
||||
|
||||
// TODO: Also check for Japanese characters.
|
||||
// If found, this is *not* an incorrect conversion.
|
||||
bool isMojibake = false;
|
||||
bool isMojibakeSJIS = false;
|
||||
for (const char16_t *p = stfsMetadata.title_name; *p != 0; p++) {
|
||||
// FIXME: Use proper byteswapping as the cases instead of using be16_to_cpu().
|
||||
const char16_t c = be16_to_cpu(*p);
|
||||
switch (c) {
|
||||
case 0x0192: // 'ƒ': LATIN SMALL LETTER F WITH HOOK
|
||||
case 0x00E6: // 'æ': LATIN SMALL LETTER AE
|
||||
case 0x20AC: // '€': EURO SIGN
|
||||
case 0x2020: // '†': DAGGER
|
||||
// Likely mojibake
|
||||
isMojibake = true;
|
||||
break;
|
||||
|
||||
case 0x0081:
|
||||
// Likely mojibake, with Shift-JIS encoding instead of UTF-8.
|
||||
isMojibake = true;
|
||||
isMojibakeSJIS = true;
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (likely(!isMojibake)) {
|
||||
return utf16be_to_utf8(stfsMetadata.title_name, ARRAY_SIZE(stfsMetadata.title_name));
|
||||
}
|
||||
|
||||
// Convert to cp1252. This will actually be either UTF-8 or Shift-JIS.
|
||||
#if SYS_BYTEORDER == SYS_LIL_ENDIAN
|
||||
u16string str_utf16 = utf16be_to_utf16(stfsMetadata.title_name, ARRAY_SIZE(stfsMetadata.title_name));
|
||||
string str_cp1252 = utf16_to_cp1252(str_utf16.data(), static_cast<int>(str_utf16.size()));
|
||||
return utf16_to_cp1252(str_utf16.data(), static_cast<int>(str_utf16.size()));
|
||||
#else /* SYS_BYTEORDER == SYS_BIG_ENDIAN */
|
||||
return utf16_to_cp1252(stfsMetadata.title_name, ARRAY_SIZE(stfsMetadata.title_name));
|
||||
#endif
|
||||
}
|
||||
|
||||
/** Xbox360_STFS **/
|
||||
|
||||
/**
|
||||
@ -957,9 +1030,7 @@ int Xbox360_STFS::loadFieldData(void)
|
||||
|
||||
// Title
|
||||
if (stfsMetadata->title_name[0] != 0) {
|
||||
d->fields.addField_string(C_("RomData", "Title"),
|
||||
utf16be_to_utf8(stfsMetadata->title_name,
|
||||
ARRAY_SIZE(stfsMetadata->title_name)));
|
||||
d->fields.addField_string(C_("RomData", "Title"), d->getTitle());
|
||||
}
|
||||
|
||||
// File type
|
||||
|
@ -317,18 +317,52 @@ static inline std::string utf8_to_cp1252(const char *str, int len)
|
||||
return utf8_to_cpN(1252, str, len);
|
||||
}
|
||||
|
||||
#ifdef _WIN32
|
||||
/**
|
||||
* Convert UTF-16 text to cp1252.
|
||||
* Trailing NULL bytes will be removed.
|
||||
* Invalid characters will be ignored.
|
||||
* @param wcs [in] UTF-16 text.
|
||||
* @param len [in] Length of str, in bytes. (-1 for NULL-terminated string)
|
||||
*
|
||||
* NOTE: On Windows, WideCharToMultiByte() handles "invalid" cp1252 characters.
|
||||
* We don't need to handle it ourselves.
|
||||
*
|
||||
* @param wcs [in] UTF-16 text
|
||||
* @param len [in] Length of str, in bytes (-1 for NULL-terminated string)
|
||||
* @return cp1252 string.
|
||||
*/
|
||||
static inline std::string utf16_to_cp1252(const char16_t *wcs, int len)
|
||||
{
|
||||
return utf16_to_cpN(1252, wcs, len);
|
||||
}
|
||||
#else /* !_WIN32 */
|
||||
/**
|
||||
* Convert UTF-16 text to cp1252.
|
||||
* Trailing NULL bytes will be removed.
|
||||
*
|
||||
* NOTE: On non-Windows systems, iconv() does *not* handle "invalid" cp1252 characters.
|
||||
* This function preprocesses the string in order to process those characters.
|
||||
* This is needed in order to handle Xbox 360 "mojibake" encoding.
|
||||
*
|
||||
* @param wcs [in] UTF-16 text
|
||||
* @param len [in] Length of wcs, in characters (-1 for NULL-terminated string)
|
||||
* @return cp1252 string.
|
||||
*/
|
||||
std::string utf16_to_cp1252(const char16_t *wcs, int len);
|
||||
#endif /* _WIN32 */
|
||||
|
||||
/**
|
||||
* Convert UTF-16 text to cp1252.
|
||||
* Trailing NULL bytes will be removed.
|
||||
*
|
||||
* NOTE: Specialized function that does *not* ignore "invalid" cp1252 characters.
|
||||
* This is needed in order to handle Xbox 360 "mojibake" encoding.
|
||||
*
|
||||
* @param wcs [in] UTF-16 string
|
||||
* @return cp1252 string.
|
||||
*/
|
||||
static inline std::string utf16_to_cp1252(const std::u16string &wcs)
|
||||
{
|
||||
return utf16_to_cp1252(wcs.data(), static_cast<int>(wcs.size()));
|
||||
}
|
||||
|
||||
/* Shift-JIS (cp932) with cp1252 fallback */
|
||||
|
||||
|
@ -41,8 +41,10 @@ static const char RP_ICONV_UTF16_ENCODING[] = "UTF-16LE";
|
||||
#include <cassert>
#include <cstdlib>
|
||||
|
||||
// C++ STL classes
|
||||
#include <vector>
|
||||
using std::string;
|
||||
using std::u16string;
|
||||
using std::vector;
|
||||
|
||||
namespace LibRpText {
|
||||
|
||||
@ -451,4 +453,74 @@ string utf16be_to_utf8(const char16_t *wcs, int len)
|
||||
return INT_utf16_to_utf8("UTF-16BE", wcs, len);
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert UTF-16 text to cp1252.
|
||||
* Trailing NULL bytes will be removed.
|
||||
*
|
||||
* NOTE: On non-Windows systems, iconv() does *not* handle "invalid" cp1252 characters.
|
||||
* This function preprocesses the string in order to process those characters.
|
||||
* This is needed in order to handle Xbox 360 "mojibake" encoding.
|
||||
*
|
||||
* @param wcs [in] UTF-16 text
|
||||
* @param len [in] Length of str, in bytes (-1 for NULL-terminated string)
|
||||
* @return cp1252 string.
|
||||
*/
|
||||
std::string utf16_to_cp1252(const char16_t *wcs, int len)
|
||||
{
|
||||
len = check_NULL_terminator(wcs, len);
|
||||
|
||||
// Find any "invalid" cp1252 characters.
|
||||
// Character indexes in wcs are stored, which should map directly
|
||||
// to character indexes in the resulting string.
|
||||
// FIXME: This may break if non-BMP characters are present.
|
||||
vector<int> char_idx;
|
||||
|
||||
for (int i = 0; i < len; i++) {
|
||||
switch (wcs[i]) {
|
||||
default:
|
||||
break;
|
||||
|
||||
case 0x0081:
|
||||
case 0x008D:
|
||||
case 0x008F:
|
||||
case 0x0090:
|
||||
case 0x009D:
|
||||
// Invalid cp1252 character.
|
||||
char_idx.push_back(i);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (char_idx.empty()) {
|
||||
// No invalid characters. Convert it directly.
|
||||
char *mbs = reinterpret_cast<char*>(rp_iconv((char*)wcs, len*sizeof(*wcs), RP_ICONV_UTF16_ENCODING, "CP1252", false));
|
||||
if (!mbs) {
|
||||
// Conversion failed...
|
||||
return {};
|
||||
}
|
||||
|
||||
return string(mbs);
|
||||
}
|
||||
|
||||
// Convert using "//TRANSLIT", then manually replace the bad characters.
|
||||
char *mbs = reinterpret_cast<char*>(rp_iconv((char*)wcs, len*sizeof(*wcs), RP_ICONV_UTF16_ENCODING, "CP1252//TRANSLIT", false));
|
||||
if (!mbs) {
|
||||
// Conversion failed...
|
||||
return {};
|
||||
}
|
||||
|
||||
string str(mbs);
|
||||
const int str_size = static_cast<int>(str.size());
|
||||
for (int idx : char_idx) {
|
||||
assert(idx < str_size);
|
||||
if (idx >= str_size) {
|
||||
// Invalid index?
|
||||
return {};
|
||||
}
|
||||
str[idx] = static_cast<char>(static_cast<uint8_t>(wcs[idx]));
|
||||
}
|
||||
|
||||
return str;
|
||||
}
|
||||
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user