diff --git a/FSCHMasterEditor.xcodeproj/project.pbxproj b/FSCHMasterEditor.xcodeproj/project.pbxproj index 64fb9c4..418fe8a 100644 --- a/FSCHMasterEditor.xcodeproj/project.pbxproj +++ b/FSCHMasterEditor.xcodeproj/project.pbxproj @@ -7,13 +7,47 @@ objects = { /* Begin PBXBuildFile section */ + B92B235328D671D800A78F39 /* PreferencePane.m in Sources */ = {isa = PBXBuildFile; fileRef = B92B235128D671D800A78F39 /* PreferencePane.m */; }; + B92B235428D671D800A78F39 /* PreferencePane.xib in Resources */ = {isa = PBXBuildFile; fileRef = B92B235228D671D800A78F39 /* PreferencePane.xib */; }; + B92B235728D6744400A78F39 /* PreferencesManager.m in Sources */ = {isa = PBXBuildFile; fileRef = B92B235628D6744400A78F39 /* PreferencesManager.m */; }; + B949CB8428DA76C9002C81CD /* CollectionLayout.m in Sources */ = {isa = PBXBuildFile; fileRef = B949CB8328DA76C9002C81CD /* CollectionLayout.m */; }; B96675EC28CBF33B00C2ECCE /* AppDelegate.m in Sources */ = {isa = PBXBuildFile; fileRef = B96675EB28CBF33B00C2ECCE /* AppDelegate.m */; }; B96675EE28CBF33D00C2ECCE /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = B96675ED28CBF33D00C2ECCE /* Assets.xcassets */; }; B96675F128CBF33D00C2ECCE /* MainMenu.xib in Resources */ = {isa = PBXBuildFile; fileRef = B96675EF28CBF33D00C2ECCE /* MainMenu.xib */; }; B96675F328CBF33D00C2ECCE /* main.m in Sources */ = {isa = PBXBuildFile; fileRef = B96675F228CBF33D00C2ECCE /* main.m */; }; + B96675FD28CBF81000C2ECCE /* ROMHeaderViewer.m in Sources */ = {isa = PBXBuildFile; fileRef = B96675FB28CBF81000C2ECCE /* ROMHeaderViewer.m */; }; + B96675FE28CBF81000C2ECCE /* ROMHeaderViewer.xib in Resources */ = {isa = PBXBuildFile; fileRef = B96675FC28CBF81000C2ECCE /* ROMHeaderViewer.xib */; }; + B966760228CBFA5900C2ECCE /* RHMainWindow.m in Sources */ = {isa = PBXBuildFile; fileRef = B966760028CBFA5900C2ECCE /* RHMainWindow.m */; }; + B966760328CBFA5900C2ECCE /* RHMainWindow.xib in Resources */ = {isa = PBXBuildFile; fileRef = 
B966760128CBFA5900C2ECCE /* RHMainWindow.xib */; }; + B966760728CC226C00C2ECCE /* IBMPlexMono-Bold.ttf in Resources */ = {isa = PBXBuildFile; fileRef = B966760428CC226C00C2ECCE /* IBMPlexMono-Bold.ttf */; }; + B966760828CC226C00C2ECCE /* IBMPlexMono-Regular.ttf in Resources */ = {isa = PBXBuildFile; fileRef = B966760528CC226C00C2ECCE /* IBMPlexMono-Regular.ttf */; }; + B966760928CC226C00C2ECCE /* IBMPlexMono-Light.ttf in Resources */ = {isa = PBXBuildFile; fileRef = B966760628CC226C00C2ECCE /* IBMPlexMono-Light.ttf */; }; + B966761128CDAAE500C2ECCE /* NDSCartridgeData.m in Sources */ = {isa = PBXBuildFile; fileRef = B966761028CDAAE500C2ECCE /* NDSCartridgeData.m */; }; + B966F64428DD7B4500BFA6D8 /* colorspacehandler.cpp in Sources */ = {isa = PBXBuildFile; fileRef = B966F63D28DD7B4400BFA6D8 /* colorspacehandler.cpp */; }; + B966F64E28DD9FA100BFA6D8 /* SyncScrollView.m in Sources */ = {isa = PBXBuildFile; fileRef = B966F64D28DD9FA100BFA6D8 /* SyncScrollView.m */; }; + B966F65128DE6DE100BFA6D8 /* PopoverInformationView.m in Sources */ = {isa = PBXBuildFile; fileRef = B966F65028DE6DE100BFA6D8 /* PopoverInformationView.m */; }; + B96CBA4528D82D4A00FCC2CA /* HexTextView.m in Sources */ = {isa = PBXBuildFile; fileRef = B96CBA4328D82D4A00FCC2CA /* HexTextView.m */; }; + B96CBA4628D82D4A00FCC2CA /* HexTextView.xib in Resources */ = {isa = PBXBuildFile; fileRef = B96CBA4428D82D4A00FCC2CA /* HexTextView.xib */; }; + B96CBA4A28D8FE2B00FCC2CA /* AddressTextView.m in Sources */ = {isa = PBXBuildFile; fileRef = B96CBA4828D8FE2B00FCC2CA /* AddressTextView.m */; }; + B96CBA4B28D8FE2B00FCC2CA /* AddressTextView.xib in Resources */ = {isa = PBXBuildFile; fileRef = B96CBA4928D8FE2B00FCC2CA /* AddressTextView.xib */; }; + B96CBA4F28DA671800FCC2CA /* ASCIITextView.m in Sources */ = {isa = PBXBuildFile; fileRef = B96CBA4D28DA671800FCC2CA /* ASCIITextView.m */; }; + B96CBA5028DA671800FCC2CA /* ASCIITextView.xib in Resources */ = {isa = PBXBuildFile; fileRef = B96CBA4E28DA671800FCC2CA /* 
ASCIITextView.xib */; }; + B9A79A2128DFDE3100966F34 /* NSString+WordCount.m in Sources */ = {isa = PBXBuildFile; fileRef = B9A79A2028DFDE3100966F34 /* NSString+WordCount.m */; }; + B9A79A2528DFEEC100966F34 /* README.txt in Resources */ = {isa = PBXBuildFile; fileRef = B9A79A2428DFEEC100966F34 /* README.txt */; }; + B9A79A2A28DFF08600966F34 /* Cocoa.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = B9A79A2928DFF08600966F34 /* Cocoa.framework */; }; + B9A79A2C28DFF08A00966F34 /* AppKit.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = B9A79A2B28DFF08A00966F34 /* AppKit.framework */; }; + B9B4B4F428D5786000F629E1 /* ColorGenerator.m in Sources */ = {isa = PBXBuildFile; fileRef = B9B4B4F328D5786000F629E1 /* ColorGenerator.m */; }; + B9B4B4F628D57B6A00F629E1 /* ndsheaderspec.plist in Resources */ = {isa = PBXBuildFile; fileRef = B9B4B4F528D57B6A00F629E1 /* ndsheaderspec.plist */; }; /* End PBXBuildFile section */ /* Begin PBXFileReference section */ + B92B235028D671D800A78F39 /* PreferencePane.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = PreferencePane.h; sourceTree = ""; }; + B92B235128D671D800A78F39 /* PreferencePane.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = PreferencePane.m; sourceTree = ""; }; + B92B235228D671D800A78F39 /* PreferencePane.xib */ = {isa = PBXFileReference; lastKnownFileType = file.xib; path = PreferencePane.xib; sourceTree = ""; }; + B92B235528D6744400A78F39 /* PreferencesManager.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = PreferencesManager.h; sourceTree = ""; }; + B92B235628D6744400A78F39 /* PreferencesManager.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = PreferencesManager.m; sourceTree = ""; }; + B949CB8228DA76C9002C81CD /* CollectionLayout.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = CollectionLayout.h; sourceTree = ""; }; + B949CB8328DA76C9002C81CD /* CollectionLayout.m 
*/ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = CollectionLayout.m; sourceTree = ""; }; B96675E728CBF33B00C2ECCE /* FSCHMasterEditor.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = FSCHMasterEditor.app; sourceTree = BUILT_PRODUCTS_DIR; }; B96675EA28CBF33B00C2ECCE /* AppDelegate.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = AppDelegate.h; sourceTree = ""; }; B96675EB28CBF33B00C2ECCE /* AppDelegate.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = AppDelegate.m; sourceTree = ""; }; @@ -21,6 +55,56 @@ B96675F028CBF33D00C2ECCE /* Base */ = {isa = PBXFileReference; lastKnownFileType = file.xib; name = Base; path = Base.lproj/MainMenu.xib; sourceTree = ""; }; B96675F228CBF33D00C2ECCE /* main.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = main.m; sourceTree = ""; }; B96675F428CBF33D00C2ECCE /* FSCHMasterEditor.entitlements */ = {isa = PBXFileReference; lastKnownFileType = text.plist.entitlements; path = FSCHMasterEditor.entitlements; sourceTree = ""; }; + B96675FA28CBF81000C2ECCE /* ROMHeaderViewer.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = ROMHeaderViewer.h; sourceTree = ""; }; + B96675FB28CBF81000C2ECCE /* ROMHeaderViewer.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = ROMHeaderViewer.m; sourceTree = ""; }; + B96675FC28CBF81000C2ECCE /* ROMHeaderViewer.xib */ = {isa = PBXFileReference; lastKnownFileType = file.xib; path = ROMHeaderViewer.xib; sourceTree = ""; }; + B96675FF28CBFA5900C2ECCE /* RHMainWindow.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = RHMainWindow.h; sourceTree = ""; }; + B966760028CBFA5900C2ECCE /* RHMainWindow.m */ = {isa = PBXFileReference; explicitFileType = sourcecode.c.objc; path = RHMainWindow.m; sourceTree = ""; }; + B966760128CBFA5900C2ECCE /* RHMainWindow.xib */ = {isa = PBXFileReference; 
lastKnownFileType = file.xib; path = RHMainWindow.xib; sourceTree = ""; }; + B966760428CC226C00C2ECCE /* IBMPlexMono-Bold.ttf */ = {isa = PBXFileReference; lastKnownFileType = file; path = "IBMPlexMono-Bold.ttf"; sourceTree = ""; }; + B966760528CC226C00C2ECCE /* IBMPlexMono-Regular.ttf */ = {isa = PBXFileReference; lastKnownFileType = file; path = "IBMPlexMono-Regular.ttf"; sourceTree = ""; }; + B966760628CC226C00C2ECCE /* IBMPlexMono-Light.ttf */ = {isa = PBXFileReference; lastKnownFileType = file; path = "IBMPlexMono-Light.ttf"; sourceTree = ""; }; + B966760B28CC66EE00C2ECCE /* NDSCartridge.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = NDSCartridge.h; sourceTree = ""; }; + B966760F28CDAAE500C2ECCE /* NDSCartridgeData.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = NDSCartridgeData.h; sourceTree = ""; }; + B966761028CDAAE500C2ECCE /* NDSCartridgeData.m */ = {isa = PBXFileReference; explicitFileType = sourcecode.cpp.objcpp; path = NDSCartridgeData.m; sourceTree = ""; }; + B966F63428DD7B4400BFA6D8 /* colorspacehandler_AVX512.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = colorspacehandler_AVX512.cpp; sourceTree = ""; }; + B966F63528DD7B4400BFA6D8 /* colorspacehandler_SSE2.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = colorspacehandler_SSE2.h; sourceTree = ""; }; + B966F63628DD7B4400BFA6D8 /* colorspacehandler_AVX512.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = colorspacehandler_AVX512.h; sourceTree = ""; }; + B966F63728DD7B4400BFA6D8 /* colorspacehandler_AVX2.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = colorspacehandler_AVX2.cpp; sourceTree = ""; }; + B966F63828DD7B4400BFA6D8 /* colorspacehandler_NEON.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = 
colorspacehandler_NEON.cpp; sourceTree = ""; }; + B966F63928DD7B4400BFA6D8 /* colorspacehandler_NEON.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = colorspacehandler_NEON.h; sourceTree = ""; }; + B966F63A28DD7B4400BFA6D8 /* colorspacehandler_AVX2.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = colorspacehandler_AVX2.h; sourceTree = ""; }; + B966F63B28DD7B4400BFA6D8 /* colorspacehandler_SSE2.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = colorspacehandler_SSE2.cpp; sourceTree = ""; }; + B966F63C28DD7B4400BFA6D8 /* colorspacehandler_AltiVec.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = colorspacehandler_AltiVec.h; sourceTree = ""; }; + B966F63D28DD7B4400BFA6D8 /* colorspacehandler.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = colorspacehandler.cpp; sourceTree = ""; }; + B966F63E28DD7B4400BFA6D8 /* colorspacehandler_AltiVec.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = colorspacehandler_AltiVec.cpp; sourceTree = ""; }; + B966F63F28DD7B4400BFA6D8 /* colorspacehandler.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = colorspacehandler.h; sourceTree = ""; }; + B966F64628DD7B5A00BFA6D8 /* types.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = types.h; sourceTree = ""; }; + B966F64728DD7B9B00BFA6D8 /* retro_miscellaneous.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = retro_miscellaneous.h; sourceTree = ""; }; + B966F64828DD7BAE00BFA6D8 /* retro_inline.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = retro_inline.h; sourceTree = ""; }; + B966F64928DD9E7B00BFA6D8 /* FSCHMasterEditor-Bridging-Header.h */ = {isa = PBXFileReference; lastKnownFileType = 
sourcecode.c.h; path = "FSCHMasterEditor-Bridging-Header.h"; sourceTree = ""; }; + B966F64C28DD9FA100BFA6D8 /* SyncScrollView.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = SyncScrollView.h; sourceTree = ""; }; + B966F64D28DD9FA100BFA6D8 /* SyncScrollView.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = SyncScrollView.m; sourceTree = ""; }; + B966F64F28DE6DE100BFA6D8 /* PopoverInformationView.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = PopoverInformationView.h; sourceTree = ""; }; + B966F65028DE6DE100BFA6D8 /* PopoverInformationView.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = PopoverInformationView.m; sourceTree = ""; }; + B96CBA4228D82D4A00FCC2CA /* HexTextView.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = HexTextView.h; sourceTree = ""; }; + B96CBA4328D82D4A00FCC2CA /* HexTextView.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = HexTextView.m; sourceTree = ""; }; + B96CBA4428D82D4A00FCC2CA /* HexTextView.xib */ = {isa = PBXFileReference; lastKnownFileType = file.xib; path = HexTextView.xib; sourceTree = ""; }; + B96CBA4728D8FE2B00FCC2CA /* AddressTextView.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = AddressTextView.h; sourceTree = ""; }; + B96CBA4828D8FE2B00FCC2CA /* AddressTextView.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = AddressTextView.m; sourceTree = ""; }; + B96CBA4928D8FE2B00FCC2CA /* AddressTextView.xib */ = {isa = PBXFileReference; lastKnownFileType = file.xib; path = AddressTextView.xib; sourceTree = ""; }; + B96CBA4C28DA671800FCC2CA /* ASCIITextView.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = ASCIITextView.h; sourceTree = ""; }; + B96CBA4D28DA671800FCC2CA /* ASCIITextView.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = ASCIITextView.m; sourceTree = 
""; }; + B96CBA4E28DA671800FCC2CA /* ASCIITextView.xib */ = {isa = PBXFileReference; lastKnownFileType = file.xib; path = ASCIITextView.xib; sourceTree = ""; }; + B9A79A1F28DFDE3100966F34 /* NSString+WordCount.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "NSString+WordCount.h"; sourceTree = ""; }; + B9A79A2028DFDE3100966F34 /* NSString+WordCount.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = "NSString+WordCount.m"; sourceTree = ""; }; + B9A79A2428DFEEC100966F34 /* README.txt */ = {isa = PBXFileReference; lastKnownFileType = text; path = README.txt; sourceTree = ""; }; + B9A79A2728DFF07D00966F34 /* CryptoKit.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = CryptoKit.framework; path = System/Library/Frameworks/CryptoKit.framework; sourceTree = SDKROOT; }; + B9A79A2928DFF08600966F34 /* Cocoa.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Cocoa.framework; path = System/Library/Frameworks/Cocoa.framework; sourceTree = SDKROOT; }; + B9A79A2B28DFF08A00966F34 /* AppKit.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = AppKit.framework; path = System/Library/Frameworks/AppKit.framework; sourceTree = SDKROOT; }; + B9B4B4F228D5786000F629E1 /* ColorGenerator.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = ColorGenerator.h; sourceTree = ""; }; + B9B4B4F328D5786000F629E1 /* ColorGenerator.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = ColorGenerator.m; sourceTree = ""; }; + B9B4B4F528D57B6A00F629E1 /* ndsheaderspec.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = ndsheaderspec.plist; sourceTree = ""; }; /* End PBXFileReference section */ /* Begin PBXFrameworksBuildPhase section */ @@ -28,6 +112,8 @@ isa = PBXFrameworksBuildPhase; buildActionMask = 2147483647; files = ( + B9A79A2C28DFF08A00966F34 /* AppKit.framework in 
Frameworks */, + B9A79A2A28DFF08600966F34 /* Cocoa.framework in Frameworks */, ); runOnlyForDeploymentPostprocessing = 0; }; @@ -39,6 +125,7 @@ children = ( B96675E928CBF33B00C2ECCE /* FSCHMasterEditor */, B96675E828CBF33B00C2ECCE /* Products */, + B9A79A2628DFF07D00966F34 /* Frameworks */, ); sourceTree = ""; }; @@ -53,16 +140,120 @@ B96675E928CBF33B00C2ECCE /* FSCHMasterEditor */ = { isa = PBXGroup; children = ( + B9A79A2328DFDE5000966F34 /* prefs */, + B9A79A2228DFDE3C00966F34 /* nds */, + B966760A28CC227500C2ECCE /* fonts */, B96675EA28CBF33B00C2ECCE /* AppDelegate.h */, B96675EB28CBF33B00C2ECCE /* AppDelegate.m */, + B96675FA28CBF81000C2ECCE /* ROMHeaderViewer.h */, + B96675FB28CBF81000C2ECCE /* ROMHeaderViewer.m */, + B96675FC28CBF81000C2ECCE /* ROMHeaderViewer.xib */, + B96675FF28CBFA5900C2ECCE /* RHMainWindow.h */, + B966760028CBFA5900C2ECCE /* RHMainWindow.m */, + B966760128CBFA5900C2ECCE /* RHMainWindow.xib */, + B966F64F28DE6DE100BFA6D8 /* PopoverInformationView.h */, + B966F65028DE6DE100BFA6D8 /* PopoverInformationView.m */, + B9B4B4F228D5786000F629E1 /* ColorGenerator.h */, + B9B4B4F328D5786000F629E1 /* ColorGenerator.m */, + B9A79A1F28DFDE3100966F34 /* NSString+WordCount.h */, + B9A79A2028DFDE3100966F34 /* NSString+WordCount.m */, + B96CBA3C28D8299600FCC2CA /* CustomViews */, + B9B4B4F528D57B6A00F629E1 /* ndsheaderspec.plist */, B96675ED28CBF33D00C2ECCE /* Assets.xcassets */, B96675EF28CBF33D00C2ECCE /* MainMenu.xib */, B96675F228CBF33D00C2ECCE /* main.m */, B96675F428CBF33D00C2ECCE /* FSCHMasterEditor.entitlements */, + B966F64928DD9E7B00BFA6D8 /* FSCHMasterEditor-Bridging-Header.h */, ); path = FSCHMasterEditor; sourceTree = ""; }; + B966760A28CC227500C2ECCE /* fonts */ = { + isa = PBXGroup; + children = ( + B966760428CC226C00C2ECCE /* IBMPlexMono-Bold.ttf */, + B966760628CC226C00C2ECCE /* IBMPlexMono-Light.ttf */, + B966760528CC226C00C2ECCE /* IBMPlexMono-Regular.ttf */, + ); + path = fonts; + sourceTree = ""; + }; + B966F63328DD7B4400BFA6D8 /* 
colorspacehandler */ = { + isa = PBXGroup; + children = ( + B966F63428DD7B4400BFA6D8 /* colorspacehandler_AVX512.cpp */, + B966F63528DD7B4400BFA6D8 /* colorspacehandler_SSE2.h */, + B966F63628DD7B4400BFA6D8 /* colorspacehandler_AVX512.h */, + B966F63728DD7B4400BFA6D8 /* colorspacehandler_AVX2.cpp */, + B966F63828DD7B4400BFA6D8 /* colorspacehandler_NEON.cpp */, + B966F63928DD7B4400BFA6D8 /* colorspacehandler_NEON.h */, + B966F63A28DD7B4400BFA6D8 /* colorspacehandler_AVX2.h */, + B966F63B28DD7B4400BFA6D8 /* colorspacehandler_SSE2.cpp */, + B966F63C28DD7B4400BFA6D8 /* colorspacehandler_AltiVec.h */, + B966F63D28DD7B4400BFA6D8 /* colorspacehandler.cpp */, + B966F63E28DD7B4400BFA6D8 /* colorspacehandler_AltiVec.cpp */, + B966F63F28DD7B4400BFA6D8 /* colorspacehandler.h */, + ); + path = colorspacehandler; + sourceTree = ""; + }; + B96CBA3C28D8299600FCC2CA /* CustomViews */ = { + isa = PBXGroup; + children = ( + B966F64C28DD9FA100BFA6D8 /* SyncScrollView.h */, + B966F64D28DD9FA100BFA6D8 /* SyncScrollView.m */, + B96CBA4228D82D4A00FCC2CA /* HexTextView.h */, + B96CBA4328D82D4A00FCC2CA /* HexTextView.m */, + B96CBA4428D82D4A00FCC2CA /* HexTextView.xib */, + B96CBA4728D8FE2B00FCC2CA /* AddressTextView.h */, + B96CBA4828D8FE2B00FCC2CA /* AddressTextView.m */, + B96CBA4928D8FE2B00FCC2CA /* AddressTextView.xib */, + B96CBA4C28DA671800FCC2CA /* ASCIITextView.h */, + B96CBA4D28DA671800FCC2CA /* ASCIITextView.m */, + B96CBA4E28DA671800FCC2CA /* ASCIITextView.xib */, + B949CB8228DA76C9002C81CD /* CollectionLayout.h */, + B949CB8328DA76C9002C81CD /* CollectionLayout.m */, + ); + path = CustomViews; + sourceTree = ""; + }; + B9A79A2228DFDE3C00966F34 /* nds */ = { + isa = PBXGroup; + children = ( + B966F63328DD7B4400BFA6D8 /* colorspacehandler */, + B966F64728DD7B9B00BFA6D8 /* retro_miscellaneous.h */, + B966F64628DD7B5A00BFA6D8 /* types.h */, + B966F64828DD7BAE00BFA6D8 /* retro_inline.h */, + B966760B28CC66EE00C2ECCE /* NDSCartridge.h */, + B966760F28CDAAE500C2ECCE /* 
NDSCartridgeData.h */, + B966761028CDAAE500C2ECCE /* NDSCartridgeData.m */, + B9A79A2428DFEEC100966F34 /* README.txt */, + ); + path = nds; + sourceTree = ""; + }; + B9A79A2328DFDE5000966F34 /* prefs */ = { + isa = PBXGroup; + children = ( + B92B235028D671D800A78F39 /* PreferencePane.h */, + B92B235128D671D800A78F39 /* PreferencePane.m */, + B92B235528D6744400A78F39 /* PreferencesManager.h */, + B92B235628D6744400A78F39 /* PreferencesManager.m */, + B92B235228D671D800A78F39 /* PreferencePane.xib */, + ); + path = prefs; + sourceTree = ""; + }; + B9A79A2628DFF07D00966F34 /* Frameworks */ = { + isa = PBXGroup; + children = ( + B9A79A2B28DFF08A00966F34 /* AppKit.framework */, + B9A79A2928DFF08600966F34 /* Cocoa.framework */, + B9A79A2728DFF07D00966F34 /* CryptoKit.framework */, + ); + name = Frameworks; + sourceTree = ""; + }; /* End PBXGroup section */ /* Begin PBXNativeTarget section */ @@ -90,10 +281,11 @@ isa = PBXProject; attributes = { BuildIndependentTargetsInParallel = 1; - LastUpgradeCheck = 1340; + LastUpgradeCheck = 1400; TargetAttributes = { B96675E628CBF33B00C2ECCE = { CreatedOnToolsVersion = 13.4.1; + LastSwiftMigration = 1400; }; }; }; @@ -120,8 +312,19 @@ isa = PBXResourcesBuildPhase; buildActionMask = 2147483647; files = ( + B96CBA4B28D8FE2B00FCC2CA /* AddressTextView.xib in Resources */, + B966760328CBFA5900C2ECCE /* RHMainWindow.xib in Resources */, + B96675FE28CBF81000C2ECCE /* ROMHeaderViewer.xib in Resources */, + B9A79A2528DFEEC100966F34 /* README.txt in Resources */, + B92B235428D671D800A78F39 /* PreferencePane.xib in Resources */, B96675EE28CBF33D00C2ECCE /* Assets.xcassets in Resources */, + B966760728CC226C00C2ECCE /* IBMPlexMono-Bold.ttf in Resources */, + B96CBA4628D82D4A00FCC2CA /* HexTextView.xib in Resources */, + B96CBA5028DA671800FCC2CA /* ASCIITextView.xib in Resources */, + B966760828CC226C00C2ECCE /* IBMPlexMono-Regular.ttf in Resources */, + B9B4B4F628D57B6A00F629E1 /* ndsheaderspec.plist in Resources */, B96675F128CBF33D00C2ECCE 
/* MainMenu.xib in Resources */, + B966760928CC226C00C2ECCE /* IBMPlexMono-Light.ttf in Resources */, ); runOnlyForDeploymentPostprocessing = 0; }; @@ -132,8 +335,22 @@ isa = PBXSourcesBuildPhase; buildActionMask = 2147483647; files = ( + B96CBA4A28D8FE2B00FCC2CA /* AddressTextView.m in Sources */, + B966F65128DE6DE100BFA6D8 /* PopoverInformationView.m in Sources */, + B966760228CBFA5900C2ECCE /* RHMainWindow.m in Sources */, B96675F328CBF33D00C2ECCE /* main.m in Sources */, + B966F64E28DD9FA100BFA6D8 /* SyncScrollView.m in Sources */, + B96675FD28CBF81000C2ECCE /* ROMHeaderViewer.m in Sources */, + B96CBA4528D82D4A00FCC2CA /* HexTextView.m in Sources */, + B9A79A2128DFDE3100966F34 /* NSString+WordCount.m in Sources */, B96675EC28CBF33B00C2ECCE /* AppDelegate.m in Sources */, + B92B235728D6744400A78F39 /* PreferencesManager.m in Sources */, + B949CB8428DA76C9002C81CD /* CollectionLayout.m in Sources */, + B966761128CDAAE500C2ECCE /* NDSCartridgeData.m in Sources */, + B966F64428DD7B4500BFA6D8 /* colorspacehandler.cpp in Sources */, + B96CBA4F28DA671800FCC2CA /* ASCIITextView.m in Sources */, + B92B235328D671D800A78F39 /* PreferencePane.m in Sources */, + B9B4B4F428D5786000F629E1 /* ColorGenerator.m in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; @@ -184,6 +401,7 @@ CLANG_WARN_UNREACHABLE_CODE = YES; CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; COPY_PHASE_STRIP = NO; + DEAD_CODE_STRIPPING = YES; DEBUG_INFORMATION_FORMAT = dwarf; ENABLE_STRICT_OBJC_MSGSEND = YES; ENABLE_TESTABILITY = YES; @@ -242,6 +460,7 @@ CLANG_WARN_UNREACHABLE_CODE = YES; CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; COPY_PHASE_STRIP = NO; + DEAD_CODE_STRIPPING = YES; DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; ENABLE_NS_ASSERTIONS = NO; ENABLE_STRICT_OBJC_MSGSEND = YES; @@ -265,13 +484,20 @@ buildSettings = { ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + ASSETCATALOG_COMPILER_INCLUDE_ALL_APPICON_ASSETS = NO; + 
CLANG_ENABLE_MODULES = YES; CODE_SIGN_ENTITLEMENTS = FSCHMasterEditor/FSCHMasterEditor.entitlements; + CODE_SIGN_IDENTITY = "-"; + "CODE_SIGN_IDENTITY[sdk=macosx*]" = "Apple Development"; CODE_SIGN_STYLE = Automatic; COMBINE_HIDPI_IMAGES = YES; - CURRENT_PROJECT_VERSION = 1; + CURRENT_PROJECT_VERSION = 2; + DEAD_CODE_STRIPPING = YES; DEVELOPMENT_TEAM = GH664RVXEQ; ENABLE_HARDENED_RUNTIME = YES; GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_KEY_CFBundleDisplayName = "FSCH's Master Editor"; + INFOPLIST_KEY_LSApplicationCategoryType = "public.app-category.developer-tools"; INFOPLIST_KEY_NSHumanReadableCopyright = ""; INFOPLIST_KEY_NSMainNibFile = MainMenu; INFOPLIST_KEY_NSPrincipalClass = NSApplication; @@ -283,6 +509,9 @@ PRODUCT_BUNDLE_IDENTIFIER = software.iwantmyapdemote.FSCHMasterEditor; PRODUCT_NAME = "$(TARGET_NAME)"; SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_OBJC_BRIDGING_HEADER = "FSCHMasterEditor/FSCHMasterEditor-Bridging-Header.h"; + SWIFT_OPTIMIZATION_LEVEL = "-Onone"; + SWIFT_VERSION = 5.0; }; name = Debug; }; @@ -291,13 +520,20 @@ buildSettings = { ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + ASSETCATALOG_COMPILER_INCLUDE_ALL_APPICON_ASSETS = NO; + CLANG_ENABLE_MODULES = YES; CODE_SIGN_ENTITLEMENTS = FSCHMasterEditor/FSCHMasterEditor.entitlements; + CODE_SIGN_IDENTITY = "-"; + "CODE_SIGN_IDENTITY[sdk=macosx*]" = "Apple Development"; CODE_SIGN_STYLE = Automatic; COMBINE_HIDPI_IMAGES = YES; - CURRENT_PROJECT_VERSION = 1; + CURRENT_PROJECT_VERSION = 2; + DEAD_CODE_STRIPPING = YES; DEVELOPMENT_TEAM = GH664RVXEQ; ENABLE_HARDENED_RUNTIME = YES; GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_KEY_CFBundleDisplayName = "FSCH's Master Editor"; + INFOPLIST_KEY_LSApplicationCategoryType = "public.app-category.developer-tools"; INFOPLIST_KEY_NSHumanReadableCopyright = ""; INFOPLIST_KEY_NSMainNibFile = MainMenu; INFOPLIST_KEY_NSPrincipalClass = NSApplication; @@ -309,6 +545,8 @@ PRODUCT_BUNDLE_IDENTIFIER = 
software.iwantmyapdemote.FSCHMasterEditor; PRODUCT_NAME = "$(TARGET_NAME)"; SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_OBJC_BRIDGING_HEADER = "FSCHMasterEditor/FSCHMasterEditor-Bridging-Header.h"; + SWIFT_VERSION = 5.0; }; name = Release; };
diff --git a/FSCHMasterEditor/AppDelegate.m b/FSCHMasterEditor/AppDelegate.m
index 1550538..e648268 100644
--- a/FSCHMasterEditor/AppDelegate.m
+++ b/FSCHMasterEditor/AppDelegate.m
@@ -6,19 +6,33 @@
 //
 
 #import "AppDelegate.h"
+#import "ROMHeaderViewer.h"
+#import "PreferencePane.h"
+#import "PreferencesManager.h"
 
-@interface AppDelegate ()
+@interface AppDelegate () {
+    // Preferences window controller; created on first use by -openPrefsPane:.
+    PreferencePane *preferences;
+    // Shared preferences singleton, fetched once at launch.
+    PreferencesManager *preferencesManager;
+}
+// Controller for the window shown at launch (a ROMHeaderViewer instance).
+@property (readwrite,strong) NSWindowController *windowController;
 
-@property (strong) IBOutlet NSWindow *window;
 @end
 
 @implementation AppDelegate
 
+// Creates and shows the ROM header viewer, then fetches the shared preferences manager.
 - (void)applicationDidFinishLaunching:(NSNotification *)aNotification {
+    self.windowController = [[ROMHeaderViewer alloc] init];
+    [self.windowController showWindow:self];
+
+    preferencesManager = [PreferencesManager sharedManager];
+
     // Insert code here to initialize your application
 }
 
-
 - (void)applicationWillTerminate:(NSNotification *)aNotification {
     // Insert code here to tear down your application
 }
@@ -28,5 +42,15 @@
     return YES;
 }
 
+// Shows the preferences window.
+// The controller is created on the first call and reused afterwards, so a
+// repeated invocation re-shows the existing window instead of replacing the
+// controller behind a window that may still be open.
+- (IBAction)openPrefsPane:(id)sender {
+    if (preferences == nil) {
+        preferences = [[PreferencePane alloc] init];
+    }
+    [preferences showWindow:nil];
+}
 
 @end
diff --git a/FSCHMasterEditor/Assets.xcassets/AppIcon.appiconset/Contents.json b/FSCHMasterEditor/Assets.xcassets/AppIcon.appiconset/Contents.json index 3f00db4..64dc11e 100644 --- a/FSCHMasterEditor/Assets.xcassets/AppIcon.appiconset/Contents.json +++ b/FSCHMasterEditor/Assets.xcassets/AppIcon.appiconset/Contents.json @@ -1,51 +1,61 @@ { "images" : [ { + "filename" : "icon_16x16.png", "idiom" : "mac", "scale" : "1x", "size" : "16x16" }, { + "filename" : "icon_16x16@2x.png", "idiom" : "mac", "scale" : "2x", "size" : "16x16" }, { + "filename" : "icon_32x32.png", "idiom" : "mac", "scale" :
"1x", "size" : "32x32" }, { + "filename" : "icon_32x32@2x.png", "idiom" : "mac", "scale" : "2x", "size" : "32x32" }, { + "filename" : "icon_128x128.png", "idiom" : "mac", "scale" : "1x", "size" : "128x128" }, { + "filename" : "icon_128x128@2x.png", "idiom" : "mac", "scale" : "2x", "size" : "128x128" }, { + "filename" : "icon_256x256.png", "idiom" : "mac", "scale" : "1x", "size" : "256x256" }, { + "filename" : "icon_256x256@2x.png", "idiom" : "mac", "scale" : "2x", "size" : "256x256" }, { + "filename" : "icon_512x512.png", "idiom" : "mac", "scale" : "1x", "size" : "512x512" }, { + "filename" : "icon_512x512@2x.png", "idiom" : "mac", "scale" : "2x", "size" : "512x512" diff --git a/FSCHMasterEditor/Assets.xcassets/AppIcon.appiconset/icon_128x128.png b/FSCHMasterEditor/Assets.xcassets/AppIcon.appiconset/icon_128x128.png new file mode 100644 index 0000000..47da9d9 Binary files /dev/null and b/FSCHMasterEditor/Assets.xcassets/AppIcon.appiconset/icon_128x128.png differ diff --git a/FSCHMasterEditor/Assets.xcassets/AppIcon.appiconset/icon_128x128@2x.png b/FSCHMasterEditor/Assets.xcassets/AppIcon.appiconset/icon_128x128@2x.png new file mode 100644 index 0000000..b033b58 Binary files /dev/null and b/FSCHMasterEditor/Assets.xcassets/AppIcon.appiconset/icon_128x128@2x.png differ diff --git a/FSCHMasterEditor/Assets.xcassets/AppIcon.appiconset/icon_16x16.png b/FSCHMasterEditor/Assets.xcassets/AppIcon.appiconset/icon_16x16.png new file mode 100644 index 0000000..9285054 Binary files /dev/null and b/FSCHMasterEditor/Assets.xcassets/AppIcon.appiconset/icon_16x16.png differ diff --git a/FSCHMasterEditor/Assets.xcassets/AppIcon.appiconset/icon_16x16@2x.png b/FSCHMasterEditor/Assets.xcassets/AppIcon.appiconset/icon_16x16@2x.png new file mode 100644 index 0000000..7f9fac9 Binary files /dev/null and b/FSCHMasterEditor/Assets.xcassets/AppIcon.appiconset/icon_16x16@2x.png differ diff --git a/FSCHMasterEditor/Assets.xcassets/AppIcon.appiconset/icon_256x256.png 
b/FSCHMasterEditor/Assets.xcassets/AppIcon.appiconset/icon_256x256.png new file mode 100644 index 0000000..b033b58 Binary files /dev/null and b/FSCHMasterEditor/Assets.xcassets/AppIcon.appiconset/icon_256x256.png differ diff --git a/FSCHMasterEditor/Assets.xcassets/AppIcon.appiconset/icon_256x256@2x.png b/FSCHMasterEditor/Assets.xcassets/AppIcon.appiconset/icon_256x256@2x.png new file mode 100644 index 0000000..ba3d82b Binary files /dev/null and b/FSCHMasterEditor/Assets.xcassets/AppIcon.appiconset/icon_256x256@2x.png differ diff --git a/FSCHMasterEditor/Assets.xcassets/AppIcon.appiconset/icon_32x32.png b/FSCHMasterEditor/Assets.xcassets/AppIcon.appiconset/icon_32x32.png new file mode 100644 index 0000000..7f9fac9 Binary files /dev/null and b/FSCHMasterEditor/Assets.xcassets/AppIcon.appiconset/icon_32x32.png differ diff --git a/FSCHMasterEditor/Assets.xcassets/AppIcon.appiconset/icon_32x32@2x.png b/FSCHMasterEditor/Assets.xcassets/AppIcon.appiconset/icon_32x32@2x.png new file mode 100644 index 0000000..50db425 Binary files /dev/null and b/FSCHMasterEditor/Assets.xcassets/AppIcon.appiconset/icon_32x32@2x.png differ diff --git a/FSCHMasterEditor/Assets.xcassets/AppIcon.appiconset/icon_512x512.png b/FSCHMasterEditor/Assets.xcassets/AppIcon.appiconset/icon_512x512.png new file mode 100644 index 0000000..ba3d82b Binary files /dev/null and b/FSCHMasterEditor/Assets.xcassets/AppIcon.appiconset/icon_512x512.png differ diff --git a/FSCHMasterEditor/Assets.xcassets/AppIcon.appiconset/icon_512x512@2x.png b/FSCHMasterEditor/Assets.xcassets/AppIcon.appiconset/icon_512x512@2x.png new file mode 100644 index 0000000..6295350 Binary files /dev/null and b/FSCHMasterEditor/Assets.xcassets/AppIcon.appiconset/icon_512x512@2x.png differ diff --git a/FSCHMasterEditor/Base.lproj/MainMenu.xib b/FSCHMasterEditor/Base.lproj/MainMenu.xib index 769719a..23d31c2 100644 --- a/FSCHMasterEditor/Base.lproj/MainMenu.xib +++ b/FSCHMasterEditor/Base.lproj/MainMenu.xib @@ -1,8 +1,8 @@ - + - - + + @@ 
-11,12 +11,8 @@ - - - - - - + + @@ -31,7 +27,11 @@ - + + + + + @@ -68,11 +68,6 @@ - - - - - @@ -97,546 +92,57 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - + - + - + - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + - - - - - - - - - - - - - + - + - + - + - + - + - + - + - + - + - + - + - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Default - - - - - - - Left to Right - - - - - - - Right to Left - - - - - - - - - - - Default - - - - - - - Left to Right - - - - - - - Right to Left - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -649,12 +155,6 @@ - - - - - - @@ -680,16 +180,5 @@ - - - - - - - - - - - diff --git a/FSCHMasterEditor/ColorGenerator.h b/FSCHMasterEditor/ColorGenerator.h new file mode 100644 index 0000000..f1199d0 --- /dev/null +++ b/FSCHMasterEditor/ColorGenerator.h @@ -0,0 +1,19 @@ +// +// ColorGenerator.h +// FSCHMasterEditor +// +// Created by Dylan Laws on 9/16/22. 
// ---- ColorGenerator.h ----

NS_ASSUME_NONNULL_BEGIN

/// Hands out colours sampled from the gradient stored in the preferences.
@interface ColorGenerator : NSObject

/// Returns the colour at `pos` along the preference gradient.
/// @param pos Normalised position; 0 is the first gradient colour, 1 the last.
/// @return The interpolated colour, or the standard text colour when no
///         gradient could be built (for example, the preference is empty).
- (NSColor *)colorFromGradient:(CGFloat)pos;

@end

NS_ASSUME_NONNULL_END

// ---- ColorGenerator.m ----

#import "ColorGenerator.h"
#import "PreferencesManager.h"

@implementation ColorGenerator {
    PreferencesManager *preferenceManager;
    NSGradient *sharedGradient;
}

/// Builds the gradient once from `gradientColors`, spacing the colours evenly.
- (instancetype)init {
    self = [super init];
    if (self == nil) {
        return nil;
    }

    preferenceManager = [PreferencesManager sharedManager];
    NSArray *colors = preferenceManager.gradientColors;
    NSUInteger colorCount = colors.count;
    if (colorCount == 0) {
        // Nothing to interpolate; colorFromGradient: falls back to a default.
        return self;
    }

    // NSGradient reads exactly one location per colour, so the stop array has
    // to be sized from the colour array. With five colours this produces
    // {0, 0.25, 0.5, 0.75, 1}, the values that used to be hard-coded.
    CGFloat *stops = calloc(colorCount, sizeof *stops);
    if (stops == NULL) {
        return self;
    }
    for (NSUInteger i = 1; i < colorCount; i++) {
        stops[i] = (CGFloat)i / (CGFloat)(colorCount - 1);
    }
    sharedGradient = [[NSGradient alloc] initWithColors:colors
                                            atLocations:stops
                                             colorSpace:NSColorSpace.deviceRGBColorSpace];
    free(stops);
    return self;
}

- (NSColor *)colorFromGradient:(CGFloat)pos {
    NSColor *sampled = [sharedGradient interpolatedColorAtLocation:pos];
    // The header promises a non-null colour; never hand nil to callers.
    return sampled != nil ? sampled : [NSColor textColor];
}

@end
// ---- CustomViews/ASCIITextView.h ----

NS_ASSUME_NONNULL_BEGIN

/// Collection view item that shows one byte of the ROM header as an ASCII glyph.
@interface ASCIITextView : NSCollectionViewItem

/// The byte(s) to display; read when the view loads.
@property NSData *rawData;

@property (strong) IBOutlet NSTextField *ASCIIValueLabel;

/// Sets the label colour and derives the selection highlight colour from it.
- (void)setFontColor:(NSColor *)color;

/// Only items flagged as important get a highlight when selected.
@property BOOL isImportantByte;

@end

NS_ASSUME_NONNULL_END

// ---- CustomViews/ASCIITextView.m ----

#import "ASCIITextView.h"

@interface ASCIITextView () {
    NSColor *fontColor;
    NSColor *bgColor;
}

/// Text shown for `rawData`: the decoded character, or "." when unprintable.
- (NSString *)displayStringForRawData;

@end

@implementation ASCIITextView

- (void)viewDidLoad {
    [super viewDidLoad];
    // The label is looked up by the tag assigned in ASCIITextView.xib.
    self.ASCIIValueLabel = [[self view] viewWithTag:6];
    if (self.rawData != nil && self.ASCIIValueLabel != nil) {
        [self.ASCIIValueLabel setStringValue:[self displayStringForRawData]];
    }
    if (fontColor) {
        [[self ASCIIValueLabel] setTextColor:fontColor];
    }
    self.view.wantsLayer = true;
}

- (NSString *)displayStringForRawData {
    // rawData is a bare byte buffer with no NUL terminator, so it is decoded
    // with its explicit length instead of being treated as a C string.
    NSString *decoded = [[NSString alloc] initWithData:self.rawData
                                              encoding:NSASCIIStringEncoding];
    if (decoded.length == 0) {
        return @".";
    }
    NSCharacterSet *whitespaceSet = [NSCharacterSet whitespaceAndNewlineCharacterSet];
    NSCharacterSet *controlSet = [NSCharacterSet controlCharacterSet];
    BOOL unprintable =
        [decoded rangeOfCharacterFromSet:whitespaceSet].location != NSNotFound ||
        [decoded rangeOfCharacterFromSet:controlSet].location != NSNotFound;
    return unprintable ? @"." : decoded;
}

- (NSNibName)nibName {
    return @"ASCIITextView";
}

- (void)setFontColor:(NSColor *)color {
    fontColor = color;

    // The component accessors raise for colours outside an RGB colour space
    // (catalog or pattern colours), so convert before reading them.
    NSColor *rgb = [color colorUsingColorSpace:[NSColorSpace deviceRGBColorSpace]];
    if (rgb != nil) {
        // The highlight is the inverse of the text colour, slightly translucent.
        bgColor = [NSColor colorWithRed:1.0f - rgb.redComponent
                                  green:1.0f - rgb.greenComponent
                                   blue:1.0f - rgb.blueComponent
                                  alpha:0.6f];
    } else {
        bgColor = nil;
    }

    // viewDidLoad applies the colour only once; cover later changes as well.
    if (self.isViewLoaded && fontColor != nil) {
        [[self ASCIIValueLabel] setTextColor:fontColor];
    }
}

- (void)setSelected:(BOOL)selected {
    [super setSelected:selected];
    if (selected && self.isImportantByte) {
        [[[self view] layer] setBackgroundColor:bgColor.CGColor];
    } else {
        [[[self view] layer] setBackgroundColor:[NSColor clearColor].CGColor];
    }
}

// NOTE(review): wantsUpdateLayer is an NSView hook; confirm whether AppKit
// ever calls it on a collection view item. Kept unchanged for compatibility.
- (BOOL)wantsUpdateLayer {
    return YES;
}

@end

// ---- CustomViews/AddressTextView.h ----

NS_ASSUME_NONNULL_BEGIN

/// Collection view item that shows the address label of one hex-dump row.
@interface AddressTextView : NSCollectionViewItem

@property NSString *addressString;
@property (strong) IBOutlet NSTextField *addressLabel;

@end

NS_ASSUME_NONNULL_END
// ---- CustomViews/AddressTextView.m ----

#import "AddressTextView.h"

@interface AddressTextView ()

@end

@implementation AddressTextView

/// Looks up the label by its XIB tag and fills in the address text.
- (void)viewDidLoad {
    [super viewDidLoad];
    self.addressLabel = [[self view] viewWithTag:2];
    if (self.addressLabel != nil) {
        // -setStringValue: raises on nil; the view may load before the
        // address has been assigned, so fall back to an empty label.
        NSString *text = self.addressString;
        [self.addressLabel setStringValue:(text != nil ? text : @"")];
    }
}

@end

// ---- CustomViews/CollectionLayout.h ----

NS_ASSUME_NONNULL_BEGIN

/// Flow layout with fixed-size cells shared by the address, hex and ASCII views.
@interface CollectionLayout : NSCollectionViewFlowLayout

@end

NS_ASSUME_NONNULL_END
// ---- CustomViews/CollectionLayout.m ----

#import "CollectionLayout.h"

// Cell metrics in points. Address cells are wider because they hold a
// multi-character offset label; byte cells are square.
static const CGFloat kCollectionLayoutRowHeight = 16;
static const CGFloat kCollectionLayoutByteCellWidth = 16;
static const CGFloat kCollectionLayoutAddressCellWidth = 48;

@implementation CollectionLayout

/// Configures a gap-free vertical grid whose cell width depends on whether the
/// owning collection view is the address column (identifier "addressView").
- (void)prepareLayout {
    BOOL isAddressColumn =
        [[[self collectionView] identifier] isEqualToString:@"addressView"];
    CGFloat cellWidth = isAddressColumn ? kCollectionLayoutAddressCellWidth
                                        : kCollectionLayoutByteCellWidth;
    NSSize cellSize = NSMakeSize(cellWidth, kCollectionLayoutRowHeight);

    self.scrollDirection = NSCollectionViewScrollDirectionVertical;
    self.minimumLineSpacing = 0;
    self.minimumInteritemSpacing = 0;
    self.itemSize = cellSize;
    self.estimatedItemSize = cellSize;
    self.sectionInset = NSEdgeInsetsMake(0, 0, 0, 0);

    // Let the flow layout do its own preparation with the metrics set above;
    // the override previously never reached the superclass implementation.
    [super prepareLayout];
}

/// The grid does not depend on the visible bounds, so scrolling never
/// invalidates the layout.
- (BOOL)shouldInvalidateLayoutForBoundsChange:(CGRect)newBounds
{
    return NO;
}

@end
// ---- CustomViews/HexTextView.h ----

NS_ASSUME_NONNULL_BEGIN

/// Collection view item that shows one byte of the ROM header as two hex digits.
@interface HexTextView : NSCollectionViewItem

/// The byte to display; only the first byte is rendered.
@property NSData *rawData;

@property (strong) IBOutlet NSTextField *hexValueLabel;

/// Sets the label colour and derives the selection highlight colour from it.
- (void)setFontColor:(NSColor *)color;

/// Only items flagged as important get a highlight when selected.
@property BOOL isImportantByte;

@end

NS_ASSUME_NONNULL_END

// ---- CustomViews/HexTextView.m ----

#import "HexTextView.h"

@interface HexTextView () {
    NSColor *fontColor;
    NSColor *bgColor;
}

@end

@implementation HexTextView

- (void)viewDidLoad {
    [super viewDidLoad];
    // The label is looked up by the tag assigned in HexTextView.xib.
    self.hexValueLabel = [[self view] viewWithTag:8];
    if (self.rawData.length > 0 && self.hexValueLabel != nil) {
        // Read exactly one byte. Dereferencing the buffer as `unsigned int`
        // read three bytes past the end of a one-byte NSData and could print
        // up to eight digits.
        uint8_t byteValue = 0;
        [self.rawData getBytes:&byteValue length:sizeof byteValue];
        [self.hexValueLabel setStringValue:[NSString stringWithFormat:@"%02x", byteValue]];
    }
    if (fontColor) {
        [[self hexValueLabel] setTextColor:fontColor];
    }
    self.view.wantsLayer = true;
}

- (void)setFontColor:(NSColor *)color {
    fontColor = color;

    // The component accessors raise for colours outside an RGB colour space
    // (catalog or pattern colours), so convert before reading them.
    NSColor *rgb = [color colorUsingColorSpace:[NSColorSpace deviceRGBColorSpace]];
    if (rgb != nil) {
        // The highlight is the inverse of the text colour, slightly translucent.
        bgColor = [NSColor colorWithRed:1.0f - rgb.redComponent
                                  green:1.0f - rgb.greenComponent
                                   blue:1.0f - rgb.blueComponent
                                  alpha:0.6f];
    } else {
        bgColor = nil;
    }

    // viewDidLoad applies the colour only once; cover later changes as well.
    if (self.isViewLoaded && fontColor != nil) {
        [[self hexValueLabel] setTextColor:fontColor];
    }
}

- (NSNibName)nibName {
    return @"HexTextView";
}

- (void)setSelected:(BOOL)selected {
    [super setSelected:selected];
    if (selected && self.isImportantByte) {
        [[[self view] layer] setBackgroundColor:bgColor.CGColor];
    } else {
        [[[self view] layer] setBackgroundColor:[NSColor clearColor].CGColor];
    }
}

// NOTE(review): wantsUpdateLayer is an NSView hook; confirm whether AppKit
// ever calls it on a collection view item. Kept unchanged for compatibility.
- (BOOL)wantsUpdateLayer {
    return YES;
}

@end

// ---- CustomViews/SyncScrollView.h ----

NS_ASSUME_NONNULL_BEGIN

/// Scroll view that mirrors the vertical scroll position of another scroll view.
@interface SyncScrollView : NSScrollView {
    // The scroll view being watched. NOTE(review): the original comment said
    // "not retained", but a plain object ivar is strong if this target is
    // built with ARC; confirm and mark it __weak if that is the intent.
    NSScrollView* synchronizedScrollView;
}

/// Starts mirroring `scrollview`, replacing any previously watched view.
- (void)setSynchronizedScrollView:(NSScrollView*)scrollview;
/// Stops mirroring the currently watched view, if any.
- (void)stopSynchronizing;
/// Notification handler invoked when the watched content view's bounds change.
- (void)synchronizedViewContentBoundsDidChange:(NSNotification *)notification;

@end

NS_ASSUME_NONNULL_END
// ---- CustomViews/SyncScrollView.m ----

#import "SyncScrollView.h"

@implementation SyncScrollView

/// Creates the scroll view with both scrollers hidden.
- (instancetype)initWithFrame:(NSRect)frameRect {
    self = [super initWithFrame:frameRect];
    if (self) {
        self.hasHorizontalScroller = NO;
        self.hasVerticalScroller = NO;
    }
    return self;
}

- (void)drawRect:(NSRect)dirtyRect {
    [super drawRect:dirtyRect];
}

/// Starts following the vertical scroll position of `scrollview`.
- (void)setSynchronizedScrollView:(NSScrollView*)scrollview
{
    // Stop following any previously watched scroll view.
    [self stopSynchronizing];

    // Registering with a nil object would subscribe to the bounds changes of
    // every view in the application, so there is nothing to do without a target.
    if (scrollview == nil) {
        return;
    }

    synchronizedScrollView = scrollview;

    // Get the content (clip) view of the watched scroll view.
    NSView *watchedContentView = [synchronizedScrollView contentView];

    // Make sure the watched view posts bounds-changed notifications (it
    // probably does already, but asking again is harmless).
    [watchedContentView setPostsBoundsChangedNotifications:YES];

    // Register for those notifications on the watched content view.
    [[NSNotificationCenter defaultCenter] addObserver:self
                                             selector:@selector(synchronizedViewContentBoundsDidChange:)
                                                 name:NSViewBoundsDidChangeNotification
                                               object:watchedContentView];
}

/// Moves this scroll view to the vertical offset of the view that just scrolled.
- (void)synchronizedViewContentBoundsDidChange:(NSNotification *)notification
{
    // The content view whose bounds changed.
    NSClipView *changedContentView = [notification object];

    // Origin of the visible part of the watched document.
    NSPoint changedBoundsOrigin = [changedContentView documentVisibleRect].origin;

    // Our own current origin.
    NSPoint curOffset = [[self contentView] bounds].origin;
    NSPoint newOffset = curOffset;

    // Scrolling is synchronised vertically only, so only y is copied.
    newOffset.y = changedBoundsOrigin.y;

    // Compare against the offset we would actually scroll to. Comparing
    // against the watched view's full origin forced a redundant scroll (and a
    // notification to whoever watches us) whenever only the x origins differed.
    if (!NSEqualPoints(curOffset, newOffset))
    {
        // A scroll view watching this one gets notified by this call.
        [[self contentView] scrollToPoint:newOffset];
        // Tell the scroll view to update its scrollers.
        [self reflectScrolledClipView:[self contentView]];
    }
}

/// Unregisters from the watched view's notifications and forgets it.
- (void)stopSynchronizing
{
    if (synchronizedScrollView == nil) {
        return;
    }

    NSView *watchedContentView = [synchronizedScrollView contentView];
    [[NSNotificationCenter defaultCenter] removeObserver:self
                                                    name:NSViewBoundsDidChangeNotification
                                                  object:watchedContentView];
    synchronizedScrollView = nil;
}

/// Only the scroll view identified as "ASCIIScrollview" reacts to the wheel;
/// every other instance ignores wheel events.
- (void)scrollWheel:(NSEvent *)event {
    if ([[self identifier] isEqualToString:@"ASCIIScrollview"]) {
        [super scrollWheel:event];
    }
}

@end
b/FSCHMasterEditor/FSCHMasterEditor-Bridging-Header.h @@ -0,0 +1,4 @@ +// +// Use this file to import your target's public headers that you would like to expose to Swift. +// + diff --git a/FSCHMasterEditor/FSCHMasterEditor.entitlements b/FSCHMasterEditor/FSCHMasterEditor.entitlements index f2ef3ae..19afff1 100644 --- a/FSCHMasterEditor/FSCHMasterEditor.entitlements +++ b/FSCHMasterEditor/FSCHMasterEditor.entitlements @@ -2,9 +2,9 @@ - com.apple.security.app-sandbox - - com.apple.security.files.user-selected.read-only - + com.apple.security.app-sandbox + + com.apple.security.files.user-selected.read-write + diff --git a/FSCHMasterEditor/NSString+WordCount.h b/FSCHMasterEditor/NSString+WordCount.h new file mode 100644 index 0000000..01ffb68 --- /dev/null +++ b/FSCHMasterEditor/NSString+WordCount.h @@ -0,0 +1,18 @@ +// +// NSString+WordCount.h +// FSCHMasterEditor +// +// Created by Dylan Laws on 9/24/22. +// + +#import + +NS_ASSUME_NONNULL_BEGIN + +@interface NSString (NSString_WordCount) + +- (NSUInteger)wordCount; + +@end + +NS_ASSUME_NONNULL_END diff --git a/FSCHMasterEditor/NSString+WordCount.m b/FSCHMasterEditor/NSString+WordCount.m new file mode 100644 index 0000000..ec650bb --- /dev/null +++ b/FSCHMasterEditor/NSString+WordCount.m @@ -0,0 +1,24 @@ +// +// NSString+WordCount.m +// FSCHMasterEditor +// +// Created by Dylan Laws on 9/24/22. 
// ---- NSString+WordCount.m ----

#import "NSString+WordCount.h"

@implementation NSString (NSString_WordCount)

/// Counts the non-empty runs of characters separated by whitespace or newlines.
- (NSUInteger)wordCount {
    NSCharacterSet *separators = [NSCharacterSet whitespaceAndNewlineCharacterSet];
    NSUInteger count = 0;
    // Consecutive separators yield empty components; those are not words.
    for (NSString *piece in [self componentsSeparatedByCharactersInSet:separators]) {
        if (piece.length > 0) {
            count++;
        }
    }
    return count;
}

@end

// ---- PopoverInformationView.h ----

NS_ASSUME_NONNULL_BEGIN

/// Content view of the popover that describes the selected header field.
@interface PopoverInformationView : NSView

@property (strong) NSPopover *hostPopover;
@property (strong) IBOutlet NSTextField *titleLabel;
@property (strong) IBOutlet NSTextField *descriptionLabel;

/// Fills the popover from a header-field dictionary.
/// @param dat Dictionary read with the keys "data" (NSData), "type"
///            ("string" or "hex"), "offset" and "size".
/// @param titleStr Field name shown as the title.
- (void)setPopoverData:(id)dat title:(NSString *)titleStr;
- (void)setShowingSHASUM:(NSString *)shaValue;

@end

NS_ASSUME_NONNULL_END

// ---- PopoverInformationView.m ----

#import "PopoverInformationView.h"
#import "NSString+WordCount.h"

/// Decodes `data` as ASCII up to its first NUL byte without reading past the
/// end of the buffer. Returns an empty string when nothing can be decoded.
static NSString *PopoverASCIIStringFromData(NSData *data) {
    const char *bytes = data.bytes;
    NSUInteger length = data.length;
    if (bytes == NULL || length == 0) {
        return @"";
    }
    const char *terminator = memchr(bytes, '\0', length);
    if (terminator != NULL) {
        length = (NSUInteger)(terminator - bytes);
    }
    NSString *decoded = [[NSString alloc] initWithBytes:bytes
                                                 length:length
                                               encoding:NSASCIIStringEncoding];
    return decoded != nil ? decoded : @"";
}

/// Returns the leading bytes of `data` (at most sizeof(unsigned int)) as a
/// host-order integer; fields shorter than that are zero-extended.
static unsigned int PopoverLeadingWordFromData(NSData *data) {
    unsigned int value = 0;
    NSUInteger count = data.length < sizeof value ? data.length : sizeof value;
    if (count > 0) {
        [data getBytes:&value length:count];
    }
    return value;
}

@implementation PopoverInformationView

- (void)drawRect:(NSRect)dirtyRect {
    [super drawRect:dirtyRect];
}

- (BOOL)acceptsFirstResponder {
    return TRUE;
}

- (BOOL)canBecomeKeyView {
    return YES;
}

- (void)keyDown:(NSEvent *)event {
    // TODO: hide the popover on Esc. Until then key presses are swallowed here.
}

- (void)setPopoverData:(id)dat title:(NSString *)titleStr {
    // Longer titles get a smaller font so that they fit the popover.
    NSInteger words = [titleStr wordCount];
    if (words >= 4) {
        // The `traits` argument is an NSFontTraitMask. NSFontWeightBold is a
        // CGFloat weight constant that truncated to 0 here, so the title was
        // never bold. A weight of 9 is the font manager's bold level.
        [self.titleLabel setFont:[[NSFontManager sharedFontManager] fontWithFamily:@"IBM Plex Mono"
                                                                            traits:NSBoldFontMask
                                                                            weight:9
                                                                              size:10]];
    } else if (words > 2) {
        [self.titleLabel setFont:[NSFont fontWithName:@"IBM Plex Mono" size:12]];
    } else {
        [self.titleLabel setFont:[NSFont fontWithName:@"IBM Plex Mono" size:14]];
    }
    // stringValue raises on nil.
    self.titleLabel.stringValue = titleStr != nil ? titleStr : @"";

    NSData *selectedData = dat[@"data"];
    NSString *dataFormatType = dat[@"type"];
    NSString *selectedValue = @"";
    if ([dataFormatType isEqualToString:@"string"]) {
        // The field is a raw byte range, not a NUL-terminated C string.
        selectedValue = PopoverASCIIStringFromData(selectedData);
    } else if ([dataFormatType isEqualToString:@"hex"]) {
        // Never read more bytes than the field actually holds.
        selectedValue = [NSString stringWithFormat:@"0x%02x", PopoverLeadingWordFromData(selectedData)];
    }
    self.descriptionLabel.stringValue = [NSString stringWithFormat:@"OFFSET @ 0x%02X\nSIZE %i\n%@",
                                         [dat[@"offset"] intValue], [dat[@"size"] intValue], selectedValue];
}

- (void)setShowingSHASUM:(NSString *)shaValue {
    // Not implemented yet.
}

@end
+// + +#import +#import "FSCHMasterEditor-Bridging-Header.h" +#import "SyncScrollView.h" + +NS_ASSUME_NONNULL_BEGIN + +@protocol RHMainWindowDelegate +@optional +- (void)didFinishLoadingROM; +@end + + +@interface RHMainWindow : NSWindowController +typedef void (^completionBlock)(bool param1); + +@property (nonatomic, weak) id delegate; + +@property (strong) IBOutlet NSTextField *headerChecksumValidationLabel; +@property (strong) IBOutlet NSImageView *headerChecksumValidationImage; + +@property (strong) IBOutlet NSCollectionView *addressCollectionView; +@property (strong) IBOutlet NSCollectionView *hexCollectionView; +@property (strong) IBOutlet NSCollectionView *asciiCollectionView; + +@property (strong) IBOutlet SyncScrollView *hexScrollView; +@property (strong) IBOutlet NSClipView *hexClipView; +@property (strong) IBOutlet SyncScrollView *asciiScrollView; +@property (strong) IBOutlet NSClipView *asciiClipView; +@property (strong) IBOutlet SyncScrollView *addressScrollview; +@property (strong) IBOutlet NSClipView *addressClipview; + +@property (strong) IBOutlet NSPopover *popoverView; +@property (strong) IBOutlet NSImageView *bannerIconImageView; + + +- (instancetype)initWithPathToROM:(NSURL *)romPath delegate:(id )delegate; +- (void)setROMPath:(NSURL *)path; +- (void)doReload; + +@end + +NS_ASSUME_NONNULL_END diff --git a/FSCHMasterEditor/RHMainWindow.m b/FSCHMasterEditor/RHMainWindow.m new file mode 100644 index 0000000..9e432bf --- /dev/null +++ b/FSCHMasterEditor/RHMainWindow.m @@ -0,0 +1,395 @@ +// +// RHMainWindow.m +// FSCHMasterEditor +// +// Created by Dylan Laws on 9/9/22. 
// ---- RHMainWindow.m ----

#import "RHMainWindow.h"
#import "NDSCartridge.h"
#import "NDSCartridgeData.h"
#import "ColorGenerator.h"
#import "HexTextView.h"
#import "AddressTextView.h"
#import "ASCIITextView.h"
#import "PopoverInformationView.h"
// NOTE(review): the original file also had one system #include at this point
// whose target is missing from this copy of the diff; restore it from the
// repository.

// Distance in bytes between two consecutive address labels.
// NOTE(review): presumably the number of byte cells per row; confirm against
// the width of the hex collection view.
static const int kRHAddressLabelStride = 0x1A;

// Number of bytes read from the start of the ROM to build the cartridge data.
static const int kRHInitialHeaderReadLength = 1024;

// Lower bound for the delay between two banner frames, so that a frame with a
// zero duration cannot turn the animation into a busy loop.
static const NSTimeInterval kRHMinimumBannerFrameDelay = 1.0 / 60.0;

/// Decodes `data` as ASCII up to its first NUL byte without reading past the
/// end of the buffer. Returns an empty string when nothing can be decoded.
static NSString *RHASCIIStringFromData(NSData *data) {
    const char *bytes = data.bytes;
    NSUInteger length = data.length;
    if (bytes == NULL || length == 0) {
        return @"";
    }
    const char *terminator = memchr(bytes, '\0', length);
    if (terminator != NULL) {
        length = (NSUInteger)(terminator - bytes);
    }
    NSString *decoded = [[NSString alloc] initWithBytes:bytes
                                                 length:length
                                               encoding:NSASCIIStringEncoding];
    return decoded != nil ? decoded : @"";
}

/// Returns the leading bytes of `data` (at most sizeof(unsigned int)) as a
/// host-order integer; fields shorter than that are zero-extended.
static unsigned int RHLeadingWordFromData(NSData *data) {
    unsigned int value = 0;
    NSUInteger count = data.length < sizeof value ? data.length : sizeof value;
    if (count > 0) {
        [data getBytes:&value length:count];
    }
    return value;
}

@implementation NSColor (NSColor_Random)

/// Returns an opaque colour with uniformly random red, green and blue.
+ (NSColor *)randomColor {
    // Colour components are fractions in [0, 1], not 0-255 integers.
    CGFloat r = (CGFloat)arc4random_uniform(256) / 255.0;
    CGFloat g = (CGFloat)arc4random_uniform(256) / 255.0;
    CGFloat b = (CGFloat)arc4random_uniform(256) / 255.0;
    return [NSColor colorWithRed:r green:g blue:b alpha:1];
}

@end

@implementation NSData (NSData_Conversion)

#pragma mark - String Conversion
/// Returns the bytes as lowercase two-digit hex values, each followed by a
/// space. Returns an empty string when the data is empty.
- (NSString *)hexadecimalString {
    const unsigned char *dataBuffer = (const unsigned char *)[self bytes];
    if (!dataBuffer) {
        return [NSString string];
    }

    NSUInteger dataLength = [self length];
    // Three characters per byte: two digits plus the separating space.
    NSMutableString *hexString = [NSMutableString stringWithCapacity:(dataLength * 3)];
    for (NSUInteger i = 0; i < dataLength; ++i) {
        [hexString appendFormat:@"%02lx ", (unsigned long)dataBuffer[i]];
    }
    return [NSString stringWithString:hexString];
}

@end

@interface RHMainWindow () {
    NSURL *romFilePath;
    NSFileHandle *romFileHandle;
    NSMutableDictionary *romHeaderInfo;
    NSMutableArray *addressesArray;
    NDSCartridgeData *cartridgeData;
    ColorGenerator *colorGen;
    BOOL is3DSTypeROM;
    BOOL isCollectionViewDataReady;
    NSSet *previousSelectionSet;
    int bannerFrame;
    BOOL popoverIsShown;
    NSTimer *bannerTimer;   // pending banner frame; at most one at a time
}
@property (strong) IBOutlet NSTableView *tableView;

@end

@implementation RHMainWindow

#pragma mark - Lifecycle

- (instancetype)initWithPathToROM:(NSURL *)romPath delegate:(id<RHMainWindowDelegate>)ndelegate {
    self = [super init];
    if (self) {
        addressesArray = [[NSMutableArray alloc] init];
        colorGen = [[ColorGenerator alloc] init];
        romHeaderInfo = [[NSMutableDictionary alloc] init];
        isCollectionViewDataReady = false;
        self.delegate = ndelegate;
        [self setROMPath:romPath];
    }
    return self;
}

- (void)windowDidLoad {
    [super windowDidLoad];
    [[self tableView] setDelegate:self];
    [[self tableView] setDataSource:self];
    [[self window] setTitle:[romFilePath lastPathComponent]];
    [[self addressScrollview] setHasVerticalScroller:FALSE];
    [[self hexScrollView] setHasVerticalScroller:FALSE];
    // Chain the three columns: address follows hex, hex follows ASCII.
    [[self addressScrollview] setSynchronizedScrollView:[self hexScrollView]];
    [[self hexScrollView] setSynchronizedScrollView:[self asciiScrollView]];
}

- (NSString *)windowNibName
{
    return @"RHMainWindow";
}

- (void)setROMPath:(NSURL *)path
{
    romFilePath = path;
}

#pragma mark - Loading

/// Re-reads the ROM on a background queue and refreshes the UI on success.
- (void)doReload {
    dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_HIGH, 0), ^{
        [self processRomFile:^(bool succeeded) {
            if (!succeeded) {
                return;
            }
            dispatch_async(dispatch_get_main_queue(), ^{
                [self refreshAfterLoad];
            });
        }];
    });
}

/// Main-thread half of doReload: notifies the delegate and reloads every view.
- (void)refreshAfterLoad {
    // didFinishLoadingROM is @optional, so it must be probed before the call.
    // The views are refreshed whether or not a delegate is attached.
    id loadDelegate = self.delegate;
    if ([loadDelegate respondsToSelector:@selector(didFinishLoadingROM)]) {
        [loadDelegate didFinishLoadingROM];
    }

    [[self hexCollectionView] reloadData];
    [[self asciiCollectionView] reloadData];
    [[self addressCollectionView] reloadData];
    [[self tableView] reloadData];

    self.bannerIconImageView.image = [cartridgeData bannerIcon];
    if (cartridgeData.hasAnimatedBanner) {
        [self timerInterval:nil];
    }
}

- (NSDictionary *)HeaderSpecificationDictionary {
    NSString *specPath = [[NSBundle mainBundle] pathForResource:@"ndsheaderspec" ofType:@"plist"];
    return [NSDictionary dictionaryWithContentsOfFile:specPath];
}

- (IBAction)boundsDidChange:(NSNotification *)notification {

}

/// Rebuilds romHeaderInfo from the bundled header specification, attaching to
/// every field the bytes read from the ROM ("data") and a display colour
/// ("color"). Both parameters are currently unused.
- (void)processTableView:(sNDSHeaderExt *)ndsHeader andCartridgeData:(NDSCartridgeData *)cartridgeData {
    NSDictionary *specification = [self HeaderSpecificationDictionary];
    NSArray *fieldNames = [specification allKeys];
    NSUInteger fieldCount = fieldNames.count;
    NSMutableDictionary *headerInfo = [NSMutableDictionary dictionaryWithCapacity:fieldCount];

    for (NSUInteger i = 0; i < fieldCount; i++) {
        NSString *fieldName = fieldNames[i];
        // Containers loaded from a property list are not guaranteed to be
        // mutable, so each field is copied before anything is added to it.
        NSMutableDictionary *field = [specification[fieldName] mutableCopy];
        int fieldOffset = [field[@"offset"] intValue];
        int fieldSize = [field[@"size"] intValue];

        NSData *fieldBytes = nil;
        if (fieldOffset >= 0 && fieldSize > 0) {
            [romFileHandle seekToFileOffset:(unsigned long long)fieldOffset];
            fieldBytes = [romFileHandle readDataOfLength:(NSUInteger)fieldSize];
        }
        field[@"data"] = fieldBytes != nil ? fieldBytes : [NSData data];
        field[@"color"] = [colorGen colorFromGradient:(CGFloat)i / (CGFloat)fieldCount];
        headerInfo[fieldName] = field;
    }
    // Publish the finished dictionary in one step.
    romHeaderInfo = headerInfo;
}

/// Shows the animated banner: schedules the current frame after its duration
/// and then re-arms itself for the next one. Must run on the main thread.
- (IBAction)timerInterval:(id)sender {
    // Only one animation chain may be alive. Previously every reload started
    // another endless background loop that also kept the controller alive.
    [bannerTimer invalidate];
    bannerTimer = nil;

    NSInteger frameCount = (NSInteger)cartridgeData.bannerIconAnimationFramesMax;
    if (cartridgeData == nil || frameCount <= 0) {
        return;
    }
    if (bannerFrame < 0 || bannerFrame >= frameCount) {
        bannerFrame = 0;
    }

    NSNumber *duration = [cartridgeData bannerAnimation][bannerFrame].duration;
    NSNumber *bannerBitmapIndex = [cartridgeData bannerAnimation][bannerFrame].bitmapIndex;
    NSTimeInterval delay = [duration doubleValue] / 20;
    if (delay < kRHMinimumBannerFrameDelay) {
        delay = kRHMinimumBannerFrameDelay;
    }

    __weak RHMainWindow *weakSelf = self;
    bannerTimer = [NSTimer scheduledTimerWithTimeInterval:delay repeats:NO block:^(NSTimer *timer) {
        RHMainWindow *strongSelf = weakSelf;
        if (strongSelf == nil) {
            return;
        }
        NSArray *frames = [strongSelf->cartridgeData animatedBannerIcons];
        NSUInteger bitmapIndex = [bannerBitmapIndex unsignedIntValue];
        if (bitmapIndex < frames.count) {
            strongSelf.bannerIconImageView.image = [frames objectAtIndex:bitmapIndex];
        }
        strongSelf->bannerFrame++;
        [strongSelf timerInterval:nil];
    }];
}

/// Reads the header, builds the cartridge data, the header table and the
/// address labels, then reports success through `completion`.
- (void)processForHexView:(completionBlock)completion {
    [romFileHandle seekToFileOffset:0];
    NSData *databuffer = [romFileHandle readDataOfLength:kRHInitialHeaderReadLength];
    cartridgeData = [[NDSCartridgeData alloc] initWithHeaderData:databuffer fileHandle:romFileHandle];
    int length = (int)[[self->cartridgeData romHeaderData] length];
    [self processTableView:[self->cartridgeData rawHeader] andCartridgeData:self->cartridgeData];

    // Build the labels in a fresh array so that a second reload does not
    // append to the labels produced by the first one.
    NSMutableArray *addresses = [[NSMutableArray alloc] init];
    for (int offset = 0; offset < length; offset += kRHAddressLabelStride) {
        [addresses addObject:[NSString stringWithFormat:@"%04x:", offset]];
    }
    self->addressesArray = addresses;

    self->isCollectionViewDataReady = true;
    [self->romFileHandle seekToFileOffset:0];
    completion(true);
}

/// Opens the ROM, detects the container type and processes the header.
/// `returningBlock` is called exactly once with the outcome.
- (void)processRomFile:(completionBlock)returningBlock {
    // Every failure path returns; without that the method carried on with a
    // missing path or handle and invoked the callback more than once.
    if (romFilePath == nil || [[romFilePath path] length] == 0) {
        returningBlock(false);
        return;
    }

    romFileHandle = [NSFileHandle fileHandleForReadingAtPath:[romFilePath path]];
    if (romFileHandle == nil) {
        NSLog(@"Failed to open file");
        returningBlock(false);
        return;
    }

    // NOTE(review): this loads the whole ROM into memory only to look for the
    // "NCCH" marker; consider limiting the search range.
    NSData *fileData = [romFileHandle readDataToEndOfFile];
    NSData *pattern = [@"NCCH" dataUsingEncoding:NSASCIIStringEncoding];
    NSRange range = [fileData rangeOfData:pattern options:0 range:NSMakeRange(0, fileData.length)];
    is3DSTypeROM = (range.location != NSNotFound);

    [self processForHexView:^(bool succeeded) {
        [self->romFileHandle closeFile];
        returningBlock(succeeded);
    }];
}

#pragma mark - Header field helpers

/// Header field names in the order used by the table view.
- (NSArray *)sortedHeaderKeys {
    return [[romHeaderInfo allKeys] sortedArrayUsingSelector:@selector(localizedCaseInsensitiveCompare:)];
}

/// Name of the first header field whose byte range contains `byteIndex`, or
/// nil when the byte belongs to no known field.
- (NSString *)headerKeyContainingByteAtIndex:(NSInteger)byteIndex {
    for (NSString *fieldName in [romHeaderInfo allKeys]) {
        NSDictionary *field = [romHeaderInfo objectForKey:fieldName];
        int fieldOffset = [field[@"offset"] intValue];
        int fieldEnd = fieldOffset + [field[@"size"] intValue];
        if (byteIndex >= fieldOffset && byteIndex < fieldEnd) {
            return fieldName;
        }
    }
    return nil;
}

/// Text representation of a field value according to its "type"; nil for
/// types other than "string" and "hex".
- (NSString *)displayValueForField:(NSDictionary *)field {
    NSString *dataFormatType = field[@"type"];
    NSData *fieldData = field[@"data"];
    if ([dataFormatType isEqualToString:@"string"]) {
        return RHASCIIStringFromData(fieldData);
    }
    if ([dataFormatType isEqualToString:@"hex"]) {
        return [NSString stringWithFormat:@"0x%02x", RHLeadingWordFromData(fieldData)];
    }
    return nil;
}

#pragma mark - Table view

- (NSInteger)numberOfRowsInTableView:(NSTableView *)tableView {
    if (romHeaderInfo == nil) return 0;
    return (NSInteger)[[romHeaderInfo allKeys] count];
}

-(NSView *)tableView:(NSTableView *)tableView viewForTableColumn:(NSTableColumn *)tableColumn row:(NSInteger)row {
    NSString *identifier = tableColumn.identifier;
    NSTableCellView *cell = [tableView makeViewWithIdentifier:identifier owner:self];

    NSArray *sortedKeys = [self sortedHeaderKeys];
    if (row < 0 || (NSUInteger)row >= sortedKeys.count) {
        return cell;
    }
    NSString *keyValue = sortedKeys[(NSUInteger)row];

    if ([identifier isEqualToString:@"descriptionCol"]) {
        [cell textField].stringValue = keyValue;
        return cell;
    }

    NSDictionary *field = [romHeaderInfo objectForKey:keyValue];
    BOOL isHeaderChecksum = [keyValue isEqualToString:@"Header CRC-16"];

    if (isHeaderChecksum && [field[@"type"] isEqualToString:@"hex"]) {
        // The checksum row also shows the value recalculated from the header.
        [cell textField].stringValue = [NSString stringWithFormat:@"0x%02x (recalcsum; 0x%02x)",
                                        RHLeadingWordFromData(field[@"data"]),
                                        cartridgeData.RecalculatedHeaderCRC16];
    } else {
        NSString *displayValue = [self displayValueForField:field];
        if (displayValue != nil) {
            [cell textField].stringValue = displayValue;
        }
    }
    [[cell textField] setTextColor:((NSColor *)field[@"color"])];

    if (isHeaderChecksum) {
        [cell imageView].image = cartridgeData.RecalcChecksumMatchesHeader
            ? [NSImage imageWithSystemSymbolName:@"checkmark" accessibilityDescription:@"CRC16 Matched"]
            : [NSImage imageWithSystemSymbolName:@"x.circle.fill" accessibilityDescription:@"CRC16 Check Failed"];
    } else {
        [cell imageView].image = [NSImage new];
    }
    return cell;
}

#pragma mark - Collection views

- (NSCollectionViewItem *)collectionView:(nonnull NSCollectionView *)collectionView itemForRepresentedObjectAtIndexPath:(nonnull NSIndexPath *)indexPath {
    if (!isCollectionViewDataReady) {
        return nil;
    }

    NSInteger itemIndex = indexPath.item;

    if ([[collectionView identifier] isEqualToString:@"addressView"]) {
        AddressTextView *addressItem = [[AddressTextView alloc] initWithNibName:@"AddressTextView" bundle:[NSBundle mainBundle]];
        if (itemIndex >= 0 && (NSUInteger)itemIndex < addressesArray.count) {
            addressItem.addressString = addressesArray[(NSUInteger)itemIndex];
        }
        return addressItem;
    }

    // Bytes that belong to a known header field take that field's colour.
    NSString *fieldName = [self headerKeyContainingByteAtIndex:itemIndex];
    NSColor *fieldColor = fieldName != nil ? (NSColor *)[romHeaderInfo objectForKey:fieldName][@"color"] : nil;

    // Copy out exactly the one byte this item shows, staying inside the header.
    NSData *headerData = cartridgeData.romHeaderData;
    NSData *byteData = nil;
    if (headerData != nil && itemIndex >= 0 && (NSUInteger)itemIndex < headerData.length) {
        byteData = [headerData subdataWithRange:NSMakeRange((NSUInteger)itemIndex, 1)];
    }

    if ([[collectionView identifier] isEqualToString:@"ASCIIView"]) {
        ASCIITextView *asciiItem = [[ASCIITextView alloc] initWithNibName:@"ASCIITextView" bundle:[NSBundle mainBundle]];
        if (byteData != nil) {
            [asciiItem setRawData:byteData];
            if (fieldColor) {
                [asciiItem setFontColor:fieldColor];
                asciiItem.isImportantByte = true;
            }
        }
        return asciiItem;
    }

    HexTextView *hexItem = [[HexTextView alloc] initWithNibName:@"HexTextView" bundle:[NSBundle mainBundle]];
    if (byteData != nil) {
        [hexItem setRawData:byteData];
        if (fieldColor) {
            [hexItem setFontColor:fieldColor];
            hexItem.isImportantByte = true;
        }
    }
    return hexItem;
}

- (NSInteger)collectionView:(nonnull NSCollectionView *)collectionView numberOfItemsInSection:(NSInteger)section {
    if ([[collectionView identifier] isEqualToString:@"addressView"]) {
        return (NSInteger)[addressesArray count];
    }
    if (cartridgeData != nil && cartridgeData.romHeaderData != nil) {
        return (NSInteger)[cartridgeData.romHeaderData length];
    }
    return 0;
}

/// Extends a click on one byte to the whole header field it belongs to,
/// mirrors the selection in the hex and ASCII views and shows the field popover.
- (void)collectionView:(NSCollectionView *)collectionView didSelectItemsAtIndexPaths:(NSSet<NSIndexPath *> *)indexPaths {
    if (previousSelectionSet) {
        [self.asciiCollectionView deselectItemsAtIndexPaths:[self.asciiCollectionView indexPathsForVisibleItems]];
        [self.hexCollectionView deselectItemsAtIndexPaths:[self.hexCollectionView indexPathsForVisibleItems]];
    }
    previousSelectionSet = indexPaths;

    // The selection applied to both byte views: the clicked bytes plus every
    // byte of the fields they belong to. The union used to be computed and
    // then thrown away.
    NSMutableSet<NSIndexPath *> *selection = [NSMutableSet set];

    for (NSIndexPath *path in indexPaths) {
        id item = [collectionView itemAtIndexPath:path];
        if (![item isKindOfClass:[ASCIITextView class]] && ![item isKindOfClass:[HexTextView class]]) {
            continue;
        }
        [selection addObject:path];

        NSString *selectedTitle = [self headerKeyContainingByteAtIndex:path.item];
        NSDictionary *selectedData = selectedTitle != nil ? [romHeaderInfo objectForKey:selectedTitle] : nil;
        if (selectedData == nil) {
            if (popoverIsShown) {
                [self.popoverView performClose:self];
                popoverIsShown = false;
            }
            continue;
        }

        NSInteger fieldOffset = [selectedData[@"offset"] integerValue];
        NSInteger fieldSize = [selectedData[@"size"] integerValue];
        for (NSInteger delta = 0; delta < fieldSize; delta++) {
            [selection addObject:[NSIndexPath indexPathForItem:(fieldOffset + delta) inSection:0]];
        }

        [(PopoverInformationView *)[[self.popoverView contentViewController] view] setPopoverData:selectedData title:selectedTitle];
        [self.popoverView showRelativeToRect:[collectionView frameForItemAtIndex:path.item] ofView:collectionView preferredEdge:NSRectEdgeMinY];
        popoverIsShown = true;
    }

    [self.hexCollectionView setSelectionIndexPaths:selection];
    [self.asciiCollectionView setSelectionIndexPaths:selection];
}

#pragma mark - Events and actions

- (void)keyDown:(NSEvent *)event {

}

- (void)encodeWithCoder:(nonnull NSCoder *)coder {

}

/// Copies the value of the selected header table row to the pasteboard.
- (IBAction)copy:(id)sender {
    // selectedRow is -1 when nothing is selected; check before touching the
    // pasteboard so that a stray Copy does not crash or wipe its contents.
    NSInteger selectedTableRow = [[self tableView] selectedRow];
    NSArray *sortedKeys = [self sortedHeaderKeys];
    if (selectedTableRow < 0 || (NSUInteger)selectedTableRow >= sortedKeys.count) {
        return;
    }

    NSString *selectedTableKey = sortedKeys[(NSUInteger)selectedTableRow];
    NSString *selectedValue = [self displayValueForField:romHeaderInfo[selectedTableKey]];
    if (selectedValue == nil) {
        selectedValue = @"";
    }

    [[NSPasteboard generalPasteboard] clearContents];
    [[NSPasteboard generalPasteboard] setString:selectedValue forType:NSPasteboardTypeString];
}

- (IBAction)showROMFingerprint:(id)sender {
    NSToolbarItem *btn = (NSToolbarItem *)sender;
    [(PopoverInformationView *)[[self.popoverView contentViewController] view] setShowingSHASUM:[cartridgeData ROMSHA256]];
    [self.popoverView showRelativeToRect:[btn view].frame ofView:[self window].contentView preferredEdge:NSRectEdgeMinY];
    popoverIsShown = true;
}

@end

// ---- ROMHeaderViewer.h ----

#import "RHMainWindow.h"

NS_ASSUME_NONNULL_BEGIN

// NOTE(review): any protocol list on this class is missing from this copy of
// the diff; restore it from the repository if there was one.
@interface ROMHeaderViewer : NSWindowController
@property (strong) IBOutlet NSProgressIndicator *progressView;
@end

NS_ASSUME_NONNULL_END
+// + +#import "ROMHeaderViewer.h" + +@interface ROMHeaderViewer () { + RHMainWindow *romheaderView; +} + +@property (strong) IBOutlet NSTextField *filenameLabel; + +@end + +@implementation ROMHeaderViewer + +- (void)windowDidLoad { + [super windowDidLoad]; + [[self window] registerForDraggedTypes:[NSArray arrayWithObjects:NSPasteboardTypeFileURL, nil]]; + // Implement this method to handle any initialization after your window controller's window has been loaded from its nib file. +} + +- (NSString *)windowNibName +{ + return @"ROMHeaderViewer"; +} + +- (NSDragOperation)draggingEntered:(id < NSDraggingInfo >)sender +{ + return NSDragOperationGeneric; +} +- (BOOL)prepareForDragOperation:(id < NSDraggingInfo >)sender +{ + NSPasteboard* pbrd = [sender draggingPasteboard]; + // Do something here. + NSURL *fileURL = [NSURL URLWithString:[pbrd stringForType:NSPasteboardTypeFileURL]]; + [self.filenameLabel setStringValue:[pbrd stringForType:NSPasteboardTypeFileURL]]; + romheaderView = [[RHMainWindow alloc] initWithPathToROM:fileURL delegate:self]; + [self.progressView startAnimation:nil]; + [self.progressView setHidden:FALSE]; + [romheaderView doReload]; + [[self window] orderOut:self]; + return YES; +} + +- (IBAction)openFileDialog:(id)sender { + NSOpenPanel* openDlg = [NSOpenPanel openPanel]; + [openDlg setCanChooseFiles:YES]; + [openDlg setAllowsMultipleSelection:NO]; + [openDlg setCanChooseDirectories:NO]; + + if ([openDlg runModal] == NSModalResponseOK ) + { + NSURL* fileURL = [openDlg URL]; + if (fileURL != nil) + { + [self.filenameLabel setStringValue:[fileURL lastPathComponent]]; + romheaderView = [[RHMainWindow alloc] initWithPathToROM:fileURL delegate:self]; + [self.progressView startAnimation:nil]; + [self.progressView setHidden:FALSE]; + [romheaderView doReload]; + [[self window] orderOut:self]; + } + } +} + +- (void)didFinishLoadingROM { + if (romheaderView == nil) return; + [romheaderView showWindow:nil]; + [self.progressView stopAnimation:nil]; + 
//
//  NDSCartridge.h
//  FSCHMasterEditor
//
//  Created by Dylan Laws on 9/9/22.
//

#ifndef NDSCartridge_h
#define NDSCartridge_h

// The structs below use fixed-width integer types only, so the header must
// bring them in itself instead of relying on whatever the includer defines.
#include <stdint.h>

/*!
	\brief the NDS file header format
	See gbatek for more info.

	Every member is naturally aligned, so the struct contains no padding and
	its field offsets are the file offsets (0x3B4 bytes in total).
*/
typedef struct {
	char gameTitle[12];           //!< 12 characters for the game title.
	char gameCode[4];             //!< 4 characters for the game code.
	char makercode[2];            //!< identifies the (commercial) developer.
	uint8_t unitCode;             //!< identifies the required hardware.
	uint8_t deviceType;           //!< type of device in the game card
	uint8_t deviceSize;           //!< capacity of the device (1 << n Mbit)
	uint8_t reserved1[9];
	uint8_t romversion;           //!< version of the ROM.
	uint8_t flags;                //!< bit 2: auto-boot flag.

	uint32_t arm9romOffset;       //!< offset of the arm9 binary in the nds file.
	uint32_t arm9executeAddress;  //!< address that should be executed after the binary has been copied.
	uint32_t arm9destination;     //!< destination address to where the arm9 binary should be copied.
	uint32_t arm9binarySize;      //!< size of the arm9 binary.

	uint32_t arm7romOffset;       //!< offset of the arm7 binary in the nds file.
	uint32_t arm7executeAddress;  //!< address that should be executed after the binary has been copied.
	uint32_t arm7destination;     //!< destination address to where the arm7 binary should be copied.
	uint32_t arm7binarySize;      //!< size of the arm7 binary.

	uint32_t filenameOffset;      //!< File Name Table (FNT) offset.
	uint32_t filenameSize;        //!< File Name Table (FNT) size.
	uint32_t fatOffset;           //!< File Allocation Table (FAT) offset.
	uint32_t fatSize;             //!< File Allocation Table (FAT) size.

	uint32_t arm9overlaySource;   //!< File arm9 overlay offset.
	uint32_t arm9overlaySize;     //!< File arm9 overlay size.
	uint32_t arm7overlaySource;   //!< File arm7 overlay offset.
	uint32_t arm7overlaySize;     //!< File arm7 overlay size.

	uint32_t cardControl13;       //!< Port 40001A4h setting for normal commands (used in modes 1 and 3)
	uint32_t cardControlBF;       //!< Port 40001A4h setting for KEY1 commands (used in mode 2)
	uint32_t bannerOffset;        //!< offset to the banner with icon and titles etc.

	uint16_t secureCRC16;         //!< Secure Area Checksum, CRC-16.

	uint16_t readTimeout;         //!< Secure Area Loading Timeout.

	uint32_t unknownRAM1;         //!< ARM9 Auto Load List RAM Address (?)
	uint32_t unknownRAM2;         //!< ARM7 Auto Load List RAM Address (?)

	uint32_t bfPrime1;            //!< Secure Area Disable part 1.
	uint32_t bfPrime2;            //!< Secure Area Disable part 2.
	uint32_t romSize;             //!< total size of the ROM.

	uint32_t headerSize;          //!< ROM header size.
	uint32_t zeros88[3];
	uint16_t nandRomEnd;          //!< ROM region end for NAND games.
	uint16_t nandRwStart;         //!< RW region start for NAND games.
	uint32_t zeros98[10];
	uint8_t gbaLogo[156];         //!< Nintendo logo needed for booting the game.
	uint16_t logoCRC16;           //!< Nintendo Logo Checksum, CRC-16.
	uint16_t headerCRC16;         //!< header checksum, CRC-16.

	uint32_t debugRomSource;      //!< debug ROM offset.
	uint32_t debugRomSize;        //!< debug size.
	uint32_t debugRomDestination; //!< debug RAM destination.
	uint32_t offset_0x16C;        // reserved?

	uint8_t zero[0x40];
	uint32_t region;
	uint32_t accessControl;
	uint32_t arm7SCFGSettings;
	uint16_t dsi_unk1;
	uint8_t dsi_unk2;
	uint8_t dsi_flags;

	uint32_t arm9iromOffset;      //!< offset of the arm9i binary in the nds file.
	uint32_t arm9iexecuteAddress;
	uint32_t arm9idestination;    //!< destination address to where the arm9i binary should be copied.
	uint32_t arm9ibinarySize;     //!< size of the arm9i binary.

	uint32_t arm7iromOffset;      //!< offset of the arm7i binary in the nds file.
	uint32_t deviceListDestination;
	uint32_t arm7idestination;    //!< destination address to where the arm7i binary should be copied.
	uint32_t arm7ibinarySize;     //!< size of the arm7i binary.

	uint8_t zero2[0x20];

	// 0x200
	// TODO: More DSi-specific fields.
	uint32_t dsi1[0x10/4];
	uint32_t twlRomSize;
	uint32_t dsi_unk3;
	uint32_t dsi_unk4;
	uint32_t dsi_unk5;
	uint8_t dsi2[0x10];
	uint32_t dsi_tid;
	uint32_t dsi_tid2;
	uint32_t pubSavSize;
	uint32_t prvSavSize;
	uint8_t dsi3[0x174];
} sNDSHeaderExt;

/*!
	\brief the NDS banner format.
	See gbatek for more information.

	Like the header, this struct has no padding; the DSi-specific part starts
	at offset 0x1240 and the whole banner is 0x23C0 bytes.
*/
typedef struct {
	uint16_t version;            //!< version of the banner.
	uint16_t crc[4];             //!< CRC-16s of the banner.
	uint8_t reserved[22];
	uint8_t icon[512];           //!< 32*32 icon of the game with 4 bit per pixel.
	uint16_t palette[16];        //!< the palette of the icon.
	uint16_t titles[8][128];     //!< title of the game in 8 different languages.

	// [0xA40] Reserved space, possibly for other titles.
	uint8_t reserved2[0x800];

	// DSi-specific.
	uint8_t dsi_icon[8][512];    //!< DSi animated icon frame data.
	uint16_t dsi_palette[8][16]; //!< Palette for each DSi icon frame.
	uint16_t dsi_seq[64];        //!< DSi animated icon sequence.
} sNDSBannerExt;

#endif /* NDSCartridge_h */
/*
 * Bit-by-bit CRC-16 as used by MODBUS: the register starts at 0xFFFF, each
 * input byte is XORed into the low byte, and the register is shifted right
 * eight times, XORing in the reflected polynomial 0xA001 whenever a set bit
 * is shifted out. No final XOR is applied.
 *
 * buf  bytes to checksum (must point to at least len readable bytes)
 * len  number of bytes to process; 0 returns the initial value 0xFFFF
 */
static uint16_t MODBUS_CRC16_v1( const unsigned char *buf, unsigned int len )
{
	uint16_t remainder = 0xFFFF;

	for (unsigned int pos = 0; pos < len; pos++)
	{
		remainder ^= buf[pos];

		for (int bit = 0; bit < 8; bit++)
		{
			const int carryOut = remainder & 1;

			remainder >>= 1;
			if (carryOut)
			{
				remainder ^= 0xA001;
			}
		}
	}

	return remainder;
}
CC_SHA256_Update(&ctx, [[inputFile readDataOfLength:16] bytes], 16); + offset += 16; + } + unsigned char bytes[CC_SHA256_DIGEST_LENGTH]; + CC_SHA256_Final(bytes, &ctx); + NSMutableString *ret = [NSMutableString stringWithCapacity:CC_SHA256_DIGEST_LENGTH*2]; + for(int i = 0; iheader length:sizeof(sNDSHeaderExt)]; + unsigned char *headerBuf = (unsigned char *)malloc(romHeaderBytesToRead); + [databuffer getBytes:headerBuf length:romHeaderBytesToRead]; + self.RecalculatedHeaderCRC16 = MODBUS_CRC16_v1(headerBuf, 0x15e); + free(headerBuf); + + [handle seekToFileOffset:0x0]; + self.romHeaderData = [handle readDataOfLength:romHeaderBytesToRead]; + [handle seekToFileOffset:self->header.bannerOffset]; + self->banner = *(sNDSBannerExt *)[[handle readDataOfLength:sizeof(sNDSBannerExt)] bytes]; + + //NSLog(@"%@", [[[NSData alloc] initWithBytes:&banner.dsi_icon[0] length:512] base64EncodedStringWithOptions:NSDataBase64EncodingEndLineWithCarriageReturn]); + self.bannerIcon = [self getBannerIcon]; + if (self->banner.version >= 0x0103) { + self.animatedBannerIcons = [self getAnimatedBannerIcons]; + self.hasAnimatedBanner = true; + } + + NSMutableArray *animationArray = [[NSMutableArray alloc] init]; + for (int z = 0; z < 64; z++) { + uint16_t frame1 = self->banner.dsi_seq[z]; + unsigned short frameLen = frame1 & 0xFF; + unsigned short bitmapIndex = (frame1 >> 8) & 7; + if (frameLen == 0) { + self.bannerIconAnimationFramesMax = z; + break; + } + [animationArray addObject:[[NDSBannerAnimation alloc] initWithDuration:frameLen bitmapIndex:bitmapIndex flipVertical:NO flipHorizontal:NO]]; + } + self.bannerAnimation = [animationArray copy]; + + [handle seekToFileOffset:0x230]; + NSData *twlEmagcode = [handle readDataOfLength:4]; + NSString *twlEmagcodeStr = [NSString stringWithCString:(const char*)[twlEmagcode bytes] encoding:NSASCIIStringEncoding]; + if ([twlEmagcodeStr length] > 0) { + NSLog(@"found twl emagcode"); + //[handle seekToFileOffset:0x1BF]; + //NSData *twlFileflags = [handle 
/// Pointer to the raw header struct stored in this object.
- (sNDSHeaderExt *)rawHeader {
    return &header;
}

/// Pointer to the raw banner struct stored in this object.
- (sNDSBannerExt *)banner {
    return &banner;
}

/**
 * Builds a 32x32 RGBA image from one block of 4-bit icon pixel data and a
 * 16-entry palette by decoding it with -RomIconToRGBA8888:iconPixelData:paletteData:.
 * Returns nil when the image or its bitmap representation cannot be created.
 */
- (NSImage *)iconImageFromPixelData:(uint32_t *)pixelData paletteData:(uint16_t *)paletteData {
    NSImage *iconImage = [[NSImage alloc] initWithSize:NSMakeSize(32, 32)];
    if (iconImage == nil)
    {
        return nil;
    }

    NSBitmapImageRep *bitmapRep = [[NSBitmapImageRep alloc] initWithBitmapDataPlanes:NULL
                                                                          pixelsWide:ROM_ICON_WIDTH
                                                                          pixelsHigh:ROM_ICON_HEIGHT
                                                                       bitsPerSample:8
                                                                     samplesPerPixel:4
                                                                            hasAlpha:YES
                                                                            isPlanar:NO
                                                                      colorSpaceName:NSCalibratedRGBColorSpace
                                                                         bytesPerRow:ROM_ICON_WIDTH * 4
                                                                        bitsPerPixel:32];
    if (bitmapRep == nil)
    {
        return nil;
    }

    [self RomIconToRGBA8888:(uint32_t *)[bitmapRep bitmapData] iconPixelData:pixelData paletteData:paletteData];
    [iconImage addRepresentation:bitmapRep];
    return iconImage;
}

/// The static banner icon, decoded from banner.icon with banner.palette; nil on failure.
- (NSImage *)getBannerIcon {
    return [self iconImageFromPixelData:(uint32_t *)banner.icon paletteData:(uint16_t *)banner.palette];
}

/**
 * The eight DSi icon bitmaps (banner.dsi_icon[0..7]) as images, in index order.
 * Returns nil as soon as any single frame cannot be created.
 *
 * NOTE(review): every frame is decoded with banner.palette, not with
 * banner.dsi_palette — confirm that this is intended.
 */
- (NSArray *)getAnimatedBannerIcons {
    NSMutableArray *frames = [[NSMutableArray alloc] init];
    for (int frameIndex = 0; frameIndex < 8; frameIndex++) {
        NSImage *frame = [self iconImageFromPixelData:(uint32_t *)banner.dsi_icon[frameIndex]
                                          paletteData:(uint16_t *)banner.palette];
        if (frame == nil)
        {
            return nil;
        }
        [frames addObject:frame];
    }
    return frames;
}
+ //const uint32_t *iconPixPtr = (uint32_t *)banner.icon; // Read pointer for the icon's pixel data. + + // Setup the 4-bit CLUT. + // + // The actual color values are stored with the ROM icon data in RGB555 format. + // We convert these color values and store them in the CLUT as RGBA8888 values. + // + // The first entry always represents the alpha, so just set it to 0. + //const uint16_t *clut4 = (uint16_t *)banner.palette; + CACHE_ALIGN uint32_t clut32[16]; + ColorspaceConvertBuffer555To8888Opaque(clut4, clut32, 16); + clut32[0] = 0x00000000; + + // Load the image from the icon pixel data. + // + // ROM icons are stored in 4-bit indexed color and have dimensions of 32x32 pixels. + // Also, ROM icons are split into 16 separate 8x8 pixel tiles arranged in a 4x4 + // array. Here, we sequentially read from the ROM data, and adjust our write + // location appropriately within the bitmap memory block. + for (size_t y = 0; y < 4; y++) + { + for (size_t x = 0; x < 4; x++) + { + for (size_t p = 0; p < 8; p++, iconPixPtr++) + { + // Load an entire row of palette color indices as a single 32-bit chunk. + const uint32_t palIdx = LE_TO_LOCAL_32(*iconPixPtr); + + // Set the write location. The formula below calculates the proper write + // location depending on the position of the read pointer. We use a more + // optimized version of this formula in practice. 
+ // + // bitmapOutPtr = bitmapData + ( ((y * 8) + palIdx) * 32 ) + (x * 8); + uint32_t *bitmapOutPtr = bitmapData + ( ((y << 3) + p) << 5 ) + (x << 3); + *bitmapOutPtr = clut32[(palIdx & 0x0000000F) >> 0]; + + bitmapOutPtr++; + *bitmapOutPtr = clut32[(palIdx & 0x000000F0) >> 4]; + + bitmapOutPtr++; + *bitmapOutPtr = clut32[(palIdx & 0x00000F00) >> 8]; + + bitmapOutPtr++; + *bitmapOutPtr = clut32[(palIdx & 0x0000F000) >> 12]; + + bitmapOutPtr++; + *bitmapOutPtr = clut32[(palIdx & 0x000F0000) >> 16]; + + bitmapOutPtr++; + *bitmapOutPtr = clut32[(palIdx & 0x00F00000) >> 20]; + + bitmapOutPtr++; + *bitmapOutPtr = clut32[(palIdx & 0x0F000000) >> 24]; + + bitmapOutPtr++; + *bitmapOutPtr = clut32[(palIdx & 0xF0000000) >> 28]; + } + } + } +} + +@end diff --git a/FSCHMasterEditor/nds/README.txt b/FSCHMasterEditor/nds/README.txt new file mode 100644 index 0000000..c50c33d --- /dev/null +++ b/FSCHMasterEditor/nds/README.txt @@ -0,0 +1,6 @@ +Some of this code is taken from the Desmume project, being one of my favorite emulators I *greatly* appreciate their teams time and do not want to re-use without attribution to them somewhere. + +Methods reused; +desmume/src/frontend/cocoa/cocoa_rom.mm +RomIconToRGBA8888(); +- (NSImage *)icon; diff --git a/FSCHMasterEditor/nds/colorspacehandler/colorspacehandler.cpp b/FSCHMasterEditor/nds/colorspacehandler/colorspacehandler.cpp new file mode 100644 index 0000000..0958568 --- /dev/null +++ b/FSCHMasterEditor/nds/colorspacehandler/colorspacehandler.cpp @@ -0,0 +1,1486 @@ +/* + Copyright (C) 2016-2022 DeSmuME team + + This file is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This file is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with the this software. If not, see . +*/ + +#include "colorspacehandler.h" +#include + +#if defined(ENABLE_AVX512_1) + #include "colorspacehandler_AVX512.cpp" +#endif + +#if defined(ENABLE_AVX2) + #include "colorspacehandler_AVX2.cpp" +#endif + +#if defined(ENABLE_SSE2) + #include "colorspacehandler_SSE2.cpp" +#endif + +#if defined(ENABLE_NEON_A64) + #include "colorspacehandler_NEON.cpp" +#endif + +#if defined(ENABLE_ALTIVEC) + #include "colorspacehandler_AltiVec.cpp" +#endif + +#if defined(ENABLE_AVX512_1) + #define USEVECTORSIZE_512 + #define VECTORSIZE 64 +#elif defined(ENABLE_AVX2) + #define USEVECTORSIZE_256 + #define VECTORSIZE 32 +#elif defined(ENABLE_SSE2) || defined(ENABLE_NEON_A64) || defined(ENABLE_ALTIVEC) + #define USEVECTORSIZE_128 + #define VECTORSIZE 16 +#endif + +// By default, the hand-coded vectorized code will be used instead of a compiler's built-in +// autovectorization (if supported). However, if USEMANUALVECTORIZATION is not defined, then +// the compiler will use autovectorization (if supported). +#if defined(USEVECTORSIZE_128) || defined(USEVECTORSIZE_256) || defined(USEVECTORSIZE_512) + // Comment out USEMANUALVECTORIZATION to disable the hand-coded vectorized code. 
+ #define USEMANUALVECTORIZATION +#endif + +#ifdef USEMANUALVECTORIZATION + #if defined(ENABLE_AVX512_1) + static const ColorspaceHandler_AVX512 csh; + #elif defined(ENABLE_AVX2) + static const ColorspaceHandler_AVX2 csh; + #elif defined(ENABLE_SSE2) + static const ColorspaceHandler_SSE2 csh; + #elif defined(ENABLE_NEON_A64) + static const ColorspaceHandler_NEON csh; + #elif defined(ENABLE_ALTIVEC) + static const ColorspaceHandler_AltiVec csh; + #else + static const ColorspaceHandler csh; + #endif +#else + static const ColorspaceHandler csh; +#endif + +CACHE_ALIGN u16 color_5551_swap_rb[65536]; +CACHE_ALIGN u32 color_555_to_6665_opaque[32768]; +CACHE_ALIGN u32 color_555_to_6665_opaque_swap_rb[32768]; +CACHE_ALIGN u32 color_555_to_666[32768]; +CACHE_ALIGN u32 color_555_to_8888_opaque[32768]; +CACHE_ALIGN u32 color_555_to_8888_opaque_swap_rb[32768]; +CACHE_ALIGN u32 color_555_to_888[32768]; + +//is this a crazy idea? this table spreads 5 bits evenly over 31 from exactly 0 to INT_MAX +CACHE_ALIGN const u32 material_5bit_to_31bit[32] = { + 0x00000000, 0x04210842, 0x08421084, 0x0C6318C6, + 0x10842108, 0x14A5294A, 0x18C6318C, 0x1CE739CE, + 0x21084210, 0x25294A52, 0x294A5294, 0x2D6B5AD6, + 0x318C6318, 0x35AD6B5A, 0x39CE739C, 0x3DEF7BDE, + 0x42108421, 0x46318C63, 0x4A5294A5, 0x4E739CE7, + 0x5294A529, 0x56B5AD6B, 0x5AD6B5AD, 0x5EF7BDEF, + 0x6318C631, 0x6739CE73, 0x6B5AD6B5, 0x6F7BDEF7, + 0x739CE739, 0x77BDEF7B, 0x7BDEF7BD, 0x7FFFFFFF +}; + +// 5-bit to 6-bit conversions use this formula -- dst = (src == 0) ? 0 : (2*src) + 1 +// Reference GBATEK: http://problemkaputt.de/gbatek.htm#ds3dtextureblending +CACHE_ALIGN const u8 material_5bit_to_6bit[64] = { + 0x00, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F, + 0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F, + 0x21, 0x23, 0x25, 0x27, 0x29, 0x2B, 0x2D, 0x2F, + 0x31, 0x33, 0x35, 0x37, 0x39, 0x3B, 0x3D, 0x3F, + + // Mirror of first 32 bytes of this array. 
+ 0x00, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F, + 0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F, + 0x21, 0x23, 0x25, 0x27, 0x29, 0x2B, 0x2D, 0x2F, + 0x31, 0x33, 0x35, 0x37, 0x39, 0x3B, 0x3D, 0x3F +}; + +CACHE_ALIGN const u8 material_5bit_to_8bit[64] = { + 0x00, 0x08, 0x10, 0x18, 0x21, 0x29, 0x31, 0x39, + 0x42, 0x4A, 0x52, 0x5A, 0x63, 0x6B, 0x73, 0x7B, + 0x84, 0x8C, 0x94, 0x9C, 0xA5, 0xAD, 0xB5, 0xBD, + 0xC6, 0xCE, 0xD6, 0xDE, 0xE7, 0xEF, 0xF7, 0xFF, + + // Mirror of first 32 bytes of this array. + 0x00, 0x08, 0x10, 0x18, 0x21, 0x29, 0x31, 0x39, + 0x42, 0x4A, 0x52, 0x5A, 0x63, 0x6B, 0x73, 0x7B, + 0x84, 0x8C, 0x94, 0x9C, 0xA5, 0xAD, 0xB5, 0xBD, + 0xC6, 0xCE, 0xD6, 0xDE, 0xE7, 0xEF, 0xF7, 0xFF +}; + +CACHE_ALIGN const u8 material_6bit_to_8bit[64] = { + 0x00, 0x04, 0x08, 0x0C, 0x10, 0x14, 0x18, 0x1C, + 0x20, 0x24, 0x28, 0x2C, 0x30, 0x34, 0x38, 0x3C, + 0x41, 0x45, 0x49, 0x4D, 0x51, 0x55, 0x59, 0x5D, + 0x61, 0x65, 0x69, 0x6D, 0x71, 0x75, 0x79, 0x7D, + 0x82, 0x86, 0x8A, 0x8E, 0x92, 0x96, 0x9A, 0x9E, + 0xA2, 0xA6, 0xAA, 0xAE, 0xB2, 0xB6, 0xBA, 0xBE, + 0xC3, 0xC7, 0xCB, 0xCF, 0xD3, 0xD7, 0xDB, 0xDF, + 0xE3, 0xE7, 0xEB, 0xEF, 0xF3, 0xF7, 0xFB, 0xFF +}; + +CACHE_ALIGN const u8 material_3bit_to_5bit[64] = { + 0, 4, 8, 13, 17, 22, 26, 31, 0,0,0,0,0,0,0,0, + 0, 4, 8, 13, 17, 22, 26, 31, 0,0,0,0,0,0,0,0, + 0, 4, 8, 13, 17, 22, 26, 31, 0,0,0,0,0,0,0,0, + 0, 4, 8, 13, 17, 22, 26, 31, 0,0,0,0,0,0,0,0 +}; + +CACHE_ALIGN const u8 material_3bit_to_6bit[64] = { + 0, 8, 16, 26, 34, 44, 52, 63, 0,0,0,0,0,0,0,0, + 0, 8, 16, 26, 34, 44, 52, 63, 0,0,0,0,0,0,0,0, + 0, 8, 16, 26, 34, 44, 52, 63, 0,0,0,0,0,0,0,0, + 0, 8, 16, 26, 34, 44, 52, 63, 0,0,0,0,0,0,0,0 +}; + +CACHE_ALIGN const u8 material_3bit_to_8bit[64] = { + 0x00, 0x24, 0x49, 0x6D, 0x92, 0xB6, 0xDB, 0xFF, 0,0,0,0,0,0,0,0, + 0x00, 0x24, 0x49, 0x6D, 0x92, 0xB6, 0xDB, 0xFF, 0,0,0,0,0,0,0,0, + 0x00, 0x24, 0x49, 0x6D, 0x92, 0xB6, 0xDB, 0xFF, 0,0,0,0,0,0,0,0, + 0x00, 0x24, 0x49, 0x6D, 0x92, 0xB6, 0xDB, 0xFF, 0,0,0,0,0,0,0,0 +}; + +void 
ColorspaceHandlerInit() +{ + static bool needInitTables = true; + + if (needInitTables) + { +#define RGB15TO18_BITLOGIC(col) ( (material_5bit_to_6bit[((col)>>10)&0x1F]<<16) | (material_5bit_to_6bit[((col)>>5)&0x1F]<<8) | material_5bit_to_6bit[(col)&0x1F] ) +#define RGB15TO18_SWAP_RB_BITLOGIC(col) ( material_5bit_to_6bit[((col)>>10)&0x1F] | (material_5bit_to_6bit[((col)>>5)&0x1F]<<8) | (material_5bit_to_6bit[(col)&0x1F]<<16) ) +#define RGB15TO24_BITLOGIC(col) ( (material_5bit_to_8bit[((col)>>10)&0x1F]<<16) | (material_5bit_to_8bit[((col)>>5)&0x1F]<<8) | material_5bit_to_8bit[(col)&0x1F] ) +#define RGB15TO24_SWAP_RB_BITLOGIC(col) ( material_5bit_to_8bit[((col)>>10)&0x1F] | (material_5bit_to_8bit[((col)>>5)&0x1F]<<8) | (material_5bit_to_8bit[(col)&0x1F]<<16) ) + + for (size_t i = 0; i < 32768; i++) + { + color_555_to_666[i] = RGB15TO18_BITLOGIC(i); + color_555_to_6665_opaque[i] = RGB15TO18_BITLOGIC(i) | 0x1F000000; + color_555_to_6665_opaque_swap_rb[i] = RGB15TO18_SWAP_RB_BITLOGIC(i) | 0x1F000000; + + color_555_to_888[i] = RGB15TO24_BITLOGIC(i); + color_555_to_8888_opaque[i] = RGB15TO24_BITLOGIC(i) | 0xFF000000; + color_555_to_8888_opaque_swap_rb[i] = RGB15TO24_SWAP_RB_BITLOGIC(i) | 0xFF000000; + } + +#define RGB16_SWAP_RB_BITLOGIC(col) ( (((col)&0x001F)<<10) | ((col)&0x03E0) | (((col)&0x7C00)>>10) | ((col)&0x8000) ) + + for (size_t i = 0; i < 65536; i++) + { + color_5551_swap_rb[i] = LE_TO_LOCAL_16( RGB16_SWAP_RB_BITLOGIC(i) ); + } + } +} + +template +void ColorspaceConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) +{ + size_t i = 0; + +#ifdef USEMANUALVECTORIZATION + const size_t pixCountVector = pixCount - (pixCount % (VECTORSIZE / sizeof(u16))); + + if (SWAP_RB) + { + if (IS_UNALIGNED) + { + i = csh.ConvertBuffer555To8888Opaque_SwapRB_IsUnaligned(src, dst, pixCountVector); + } + else + { + i = csh.ConvertBuffer555To8888Opaque_SwapRB(src, dst, pixCountVector); + } + } + else + { + if (IS_UNALIGNED) + { + i = 
csh.ConvertBuffer555To8888Opaque_IsUnaligned(src, dst, pixCountVector); + } + else + { + i = csh.ConvertBuffer555To8888Opaque(src, dst, pixCountVector); + } + } + +#pragma LOOPVECTORIZE_DISABLE +#endif // USEMANUALVECTORIZATION + for (; i < pixCount; i++) + { + switch (BE_BYTESWAP) + { + case BESwapNone: + dst[i] = ColorspaceConvert555To8888Opaque(src[i]); + break; + + case BESwapIn: + dst[i] = ColorspaceConvert555To8888Opaque(LE_TO_LOCAL_16(src[i])); + break; + + case BESwapOut: + dst[i] = LE_TO_LOCAL_32( ColorspaceConvert555To8888Opaque(src[i]) ); + break; + + case BESwapInOut: + dst[i] = LE_TO_LOCAL_32( ColorspaceConvert555To8888Opaque(LE_TO_LOCAL_16(src[i])) ); + break; + } + } +} + +template +void ColorspaceConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) +{ + size_t i = 0; + +#ifdef USEMANUALVECTORIZATION + const size_t pixCountVector = pixCount - (pixCount % (VECTORSIZE / sizeof(u16))); + + if (SWAP_RB) + { + if (IS_UNALIGNED) + { + i = csh.ConvertBuffer555To6665Opaque_SwapRB_IsUnaligned(src, dst, pixCountVector); + } + else + { + i = csh.ConvertBuffer555To6665Opaque_SwapRB(src, dst, pixCountVector); + } + } + else + { + if (IS_UNALIGNED) + { + i = csh.ConvertBuffer555To6665Opaque_IsUnaligned(src, dst, pixCountVector); + } + else + { + i = csh.ConvertBuffer555To6665Opaque(src, dst, pixCountVector); + } + } + +#pragma LOOPVECTORIZE_DISABLE +#endif // USEMANUALVECTORIZATION + for (; i < pixCount; i++) + { + switch (BE_BYTESWAP) + { + case BESwapNone: + dst[i] = ColorspaceConvert555To6665Opaque(src[i]); + break; + + case BESwapIn: + dst[i] = ColorspaceConvert555To6665Opaque(LE_TO_LOCAL_16(src[i])); + break; + + case BESwapOut: + dst[i] = LE_TO_LOCAL_32( ColorspaceConvert555To6665Opaque(src[i]) ); + break; + + case BESwapInOut: + dst[i] = LE_TO_LOCAL_32( ColorspaceConvert555To6665Opaque(LE_TO_LOCAL_16(src[i])) ); + break; + } + } +} + +template +void ColorspaceConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t 
pixCount) +{ + size_t i = 0; + +#ifdef USEMANUALVECTORIZATION + const size_t pixCountVector = pixCount - (pixCount % (VECTORSIZE / sizeof(u32))); + + if (SWAP_RB) + { + if (IS_UNALIGNED) + { + i = csh.ConvertBuffer8888To6665_SwapRB_IsUnaligned(src, dst, pixCountVector); + } + else + { + i = csh.ConvertBuffer8888To6665_SwapRB(src, dst, pixCountVector); + } + } + else + { + if (IS_UNALIGNED) + { + i = csh.ConvertBuffer8888To6665_IsUnaligned(src, dst, pixCountVector); + } + else + { + i = csh.ConvertBuffer8888To6665(src, dst, pixCountVector); + } + } + +#pragma LOOPVECTORIZE_DISABLE +#endif // USEMANUALVECTORIZATION + for (; i < pixCount; i++) + { + dst[i] = ColorspaceConvert8888To6665(src[i]); + } +} + +template +void ColorspaceConvertBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount) +{ + size_t i = 0; + +#ifdef USEMANUALVECTORIZATION + const size_t pixCountVector = pixCount - (pixCount % (VECTORSIZE / sizeof(u32))); + + if (SWAP_RB) + { + if (IS_UNALIGNED) + { + i = csh.ConvertBuffer6665To8888_SwapRB_IsUnaligned(src, dst, pixCountVector); + } + else + { + i = csh.ConvertBuffer6665To8888_SwapRB(src, dst, pixCountVector); + } + } + else + { + if (IS_UNALIGNED) + { + i = csh.ConvertBuffer6665To8888_IsUnaligned(src, dst, pixCountVector); + } + else + { + i = csh.ConvertBuffer6665To8888(src, dst, pixCountVector); + } + } + +#pragma LOOPVECTORIZE_DISABLE +#endif // USEMANUALVECTORIZATION + for (; i < pixCount; i++) + { + dst[i] = ColorspaceConvert6665To8888(src[i]); + } +} + +template +void ColorspaceConvertBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) +{ + size_t i = 0; + +#ifdef USEMANUALVECTORIZATION + const size_t pixCountVector = pixCount - (pixCount % (VECTORSIZE / sizeof(u16))); + + if (SWAP_RB) + { + if (IS_UNALIGNED) + { + i = csh.ConvertBuffer8888To5551_SwapRB_IsUnaligned(src, dst, pixCountVector); + } + else + { + i = csh.ConvertBuffer8888To5551_SwapRB(src, dst, pixCountVector); + } + } + else + { + if 
(IS_UNALIGNED) + { + i = csh.ConvertBuffer8888To5551_IsUnaligned(src, dst, pixCountVector); + } + else + { + i = csh.ConvertBuffer8888To5551(src, dst, pixCountVector); + } + } + +#pragma LOOPVECTORIZE_DISABLE +#endif // USEMANUALVECTORIZATION + for (; i < pixCount; i++) + { + dst[i] = ColorspaceConvert8888To5551(src[i]); + } +} + +template +void ColorspaceConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) +{ + size_t i = 0; + +#ifdef USEMANUALVECTORIZATION + const size_t pixCountVector = pixCount - (pixCount % (VECTORSIZE / sizeof(u16))); + + if (SWAP_RB) + { + if (IS_UNALIGNED) + { + i = csh.ConvertBuffer6665To5551_SwapRB_IsUnaligned(src, dst, pixCountVector); + } + else + { + i = csh.ConvertBuffer6665To5551_SwapRB(src, dst, pixCountVector); + } + } + else + { + if (IS_UNALIGNED) + { + i = csh.ConvertBuffer6665To5551_IsUnaligned(src, dst, pixCountVector); + } + else + { + i = csh.ConvertBuffer6665To5551(src, dst, pixCountVector); + } + } + +#pragma LOOPVECTORIZE_DISABLE +#endif // USEMANUALVECTORIZATION + for (; i < pixCount; i++) + { + dst[i] = ColorspaceConvert6665To5551(src[i]); + } +} + +template +void ColorspaceConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pixCount) +{ + size_t i = 0; + +#ifdef USEMANUALVECTORIZATION + const size_t pixCountVector = pixCount - (pixCount % (VECTORSIZE / sizeof(u32))); + + if (SWAP_RB) + { + if (IS_UNALIGNED) + { + i = csh.ConvertBuffer888XTo8888Opaque_SwapRB_IsUnaligned(src, dst, pixCountVector); + } + else + { + i = csh.ConvertBuffer888XTo8888Opaque_SwapRB(src, dst, pixCountVector); + } + } + else + { + if (IS_UNALIGNED) + { + i = csh.ConvertBuffer888XTo8888Opaque_IsUnaligned(src, dst, pixCountVector); + } + else + { + i = csh.ConvertBuffer888XTo8888Opaque(src, dst, pixCountVector); + } + } + +#pragma LOOPVECTORIZE_DISABLE +#endif // USEMANUALVECTORIZATION + for (; i < pixCount; i++) + { + dst[i] = ColorspaceConvert888XTo8888Opaque(src[i]); + } +} + +template +void 
ColorspaceConvertBuffer555XTo888(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) +{ + size_t i = 0; + +#ifdef USEMANUALVECTORIZATION + const size_t pixCountVector = pixCount - (pixCount % ((VECTORSIZE/sizeof(u16)) * 2)); + + if (SWAP_RB) + { + if (IS_UNALIGNED) + { + i = csh.ConvertBuffer555XTo888_SwapRB_IsUnaligned(src, dst, pixCountVector); + } + else + { + i = csh.ConvertBuffer555XTo888_SwapRB(src, dst, pixCountVector); + } + } + else + { + if (IS_UNALIGNED) + { + i = csh.ConvertBuffer555XTo888_IsUnaligned(src, dst, pixCountVector); + } + else + { + i = csh.ConvertBuffer555XTo888(src, dst, pixCountVector); + } + } + +#pragma LOOPVECTORIZE_DISABLE +#endif // USEMANUALVECTORIZATION + for (; i < pixCount; i++) + { + ColorspaceConvert555XTo888(src[i], &dst[i*3]); + } +} + +template +void ColorspaceConvertBuffer888XTo888(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) +{ + size_t i = 0; + +#ifdef USEMANUALVECTORIZATION + const size_t pixCountVector = pixCount - (pixCount % ((VECTORSIZE/sizeof(u32)) * 4)); + + if (SWAP_RB) + { + if (IS_UNALIGNED) + { + i = csh.ConvertBuffer888XTo888_SwapRB_IsUnaligned(src, dst, pixCountVector); + } + else + { + i = csh.ConvertBuffer888XTo888_SwapRB(src, dst, pixCountVector); + } + } + else + { + if (IS_UNALIGNED) + { + i = csh.ConvertBuffer888XTo888_IsUnaligned(src, dst, pixCountVector); + } + else + { + i = csh.ConvertBuffer888XTo888(src, dst, pixCountVector); + } + } + +#pragma LOOPVECTORIZE_DISABLE +#endif // USEMANUALVECTORIZATION + for (; i < pixCount; i++) + { + ColorspaceConvert888XTo888(src[i], &dst[i*3]); + } +} + +template +void ColorspaceCopyBuffer16(const u16 *src, u16 *dst, size_t pixCount) +{ + if (!SWAP_RB) + { + memcpy(dst, src, pixCount * sizeof(u16)); + return; + } + + size_t i = 0; + +#ifdef USEMANUALVECTORIZATION + const size_t pixCountVector = pixCount - (pixCount % (VECTORSIZE / sizeof(u16))); + + if (IS_UNALIGNED) + { + i = csh.CopyBuffer16_SwapRB_IsUnaligned(src, dst, 
pixCountVector); + } + else + { + i = csh.CopyBuffer16_SwapRB(src, dst, pixCountVector); + } + +#pragma LOOPVECTORIZE_DISABLE +#endif // USEMANUALVECTORIZATION + for (; i < pixCount; i++) + { + dst[i] = ColorspaceCopy16(src[i]); + } +} + +template +void ColorspaceCopyBuffer32(const u32 *src, u32 *dst, size_t pixCount) +{ + if (!SWAP_RB) + { + memcpy(dst, src, pixCount * sizeof(u32)); + return; + } + + size_t i = 0; + +#ifdef USEMANUALVECTORIZATION + const size_t pixCountVector = pixCount - (pixCount % (VECTORSIZE / sizeof(u32))); + + if (IS_UNALIGNED) + { + i = csh.CopyBuffer32_SwapRB_IsUnaligned(src, dst, pixCountVector); + } + else + { + i = csh.CopyBuffer32_SwapRB(src, dst, pixCountVector); + } + +#pragma LOOPVECTORIZE_DISABLE +#endif // USEMANUALVECTORIZATION + for (; i < pixCount; i++) + { + dst[i] = ColorspaceCopy32(src[i]); + } +} + +template +void ColorspaceApplyIntensityToBuffer16(u16 *dst, size_t pixCount, float intensity) +{ + size_t i = 0; + +#ifdef USEMANUALVECTORIZATION + const size_t pixCountVector = pixCount - (pixCount % (VECTORSIZE / sizeof(u16))); + + if (SWAP_RB) + { + if (IS_UNALIGNED) + { + i = csh.ApplyIntensityToBuffer16_SwapRB_IsUnaligned(dst, pixCountVector, intensity); + } + else + { + i = csh.ApplyIntensityToBuffer16_SwapRB(dst, pixCountVector, intensity); + } + } + else + { + if (IS_UNALIGNED) + { + i = csh.ApplyIntensityToBuffer16_IsUnaligned(dst, pixCountVector, intensity); + } + else + { + i = csh.ApplyIntensityToBuffer16(dst, pixCountVector, intensity); + } + } + +#endif // USEMANUALVECTORIZATION + + if (intensity > 0.999f) + { + if (SWAP_RB) + { +#ifdef USEMANUALVECTORIZATION +#pragma LOOPVECTORIZE_DISABLE +#endif + for (; i < pixCount; i++) + { + dst[i] = COLOR5551_SWAP_RB(dst[i]); + } + } + + return; + } + else if (intensity < 0.001f) + { +#ifdef USEMANUALVECTORIZATION +#pragma LOOPVECTORIZE_DISABLE +#endif + for (; i < pixCount; i++) + { + dst[i] = dst[i] & 0x8000; + } + + return; + } + + const u16 intensity_u16 = 
(u16)(intensity * (float)(0xFFFF)); +#ifdef USEMANUALVECTORIZATION +#pragma LOOPVECTORIZE_DISABLE +#endif + for (; i < pixCount; i++) + { + u16 outColor = (SWAP_RB) ? COLOR5551_SWAP_RB(dst[i]) : dst[i]; + + u8 r = (u8)( (((outColor >> 0) & 0x1F) * intensity_u16) >> 16 ); + u8 g = (u8)( (((outColor >> 5) & 0x1F) * intensity_u16) >> 16 ); + u8 b = (u8)( (((outColor >> 10) & 0x1F) * intensity_u16) >> 16 ); + u16 a = outColor & 0x8000; + + dst[i] = ( (r << 0) | (g << 5) | (b << 10) | a ); + } +} + +template +void ColorspaceApplyIntensityToBuffer32(u32 *dst, size_t pixCount, float intensity) +{ + size_t i = 0; + +#ifdef USEMANUALVECTORIZATION + const size_t pixCountVector = pixCount - (pixCount % (VECTORSIZE / sizeof(u32))); + + if (SWAP_RB) + { + if (IS_UNALIGNED) + { + i = csh.ApplyIntensityToBuffer32_SwapRB_IsUnaligned(dst, pixCountVector, intensity); + } + else + { + i = csh.ApplyIntensityToBuffer32_SwapRB(dst, pixCountVector, intensity); + } + } + else + { + if (IS_UNALIGNED) + { + i = csh.ApplyIntensityToBuffer32_IsUnaligned(dst, pixCountVector, intensity); + } + else + { + i = csh.ApplyIntensityToBuffer32(dst, pixCountVector, intensity); + } + } + +#endif // USEMANUALVECTORIZATION + + if (intensity > 0.999f) + { + if (SWAP_RB) + { +#ifdef USEMANUALVECTORIZATION +#pragma LOOPVECTORIZE_DISABLE +#endif + for (; i < pixCount; i++) + { + FragmentColor dstColor; + dstColor.color = dst[i]; + + FragmentColor &outColor = (FragmentColor &)dst[i]; + outColor.r = dstColor.b; + outColor.b = dstColor.r; + } + } + + return; + } + else if (intensity < 0.001f) + { +#ifdef USEMANUALVECTORIZATION +#pragma LOOPVECTORIZE_DISABLE +#endif + for (; i < pixCount; i++) + { + dst[i] = dst[i] & 0xFF000000; + } + + return; + } + + const u16 intensity_u16 = (u16)(intensity * (float)(0xFFFF)); + + if (SWAP_RB) + { +#ifdef USEMANUALVECTORIZATION +#pragma LOOPVECTORIZE_DISABLE +#endif + for (; i < pixCount; i++) + { + FragmentColor dstColor; + dstColor.color = dst[i]; + + FragmentColor &outColor 
= (FragmentColor &)dst[i]; + outColor.r = (u8)( ((u16)dstColor.b * intensity_u16) >> 16 ); + outColor.g = (u8)( ((u16)dstColor.g * intensity_u16) >> 16 ); + outColor.b = (u8)( ((u16)dstColor.r * intensity_u16) >> 16 ); + } + } + else + { +#ifdef USEMANUALVECTORIZATION +#pragma LOOPVECTORIZE_DISABLE +#endif + for (; i < pixCount; i++) + { + FragmentColor &outColor = (FragmentColor &)dst[i]; + outColor.r = (u8)( ((u16)outColor.r * intensity_u16) >> 16 ); + outColor.g = (u8)( ((u16)outColor.g * intensity_u16) >> 16 ); + outColor.b = (u8)( ((u16)outColor.b * intensity_u16) >> 16 ); + } + } +} + +template +size_t ColorspaceHandler::ConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + size_t i = 0; + + for (; i < pixCount; i++) + { + switch (BE_BYTESWAP) + { + case BESwapNone: + dst[i] = ColorspaceConvert555To8888Opaque(src[i]); + break; + + case BESwapSrc: + dst[i] = ColorspaceConvert555To8888Opaque(LE_TO_LOCAL_16(src[i])); + break; + + case BESwapDst: + dst[i] = LE_TO_LOCAL_32( ColorspaceConvert555To8888Opaque(src[i]) ); + break; + + case BESwapSrcDst: + dst[i] = LE_TO_LOCAL_32( ColorspaceConvert555To8888Opaque(LE_TO_LOCAL_16(src[i])) ); + break; + } + } + + return i; +} + +template +size_t ColorspaceHandler::ConvertBuffer555To8888Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + size_t i = 0; + + for (; i < pixCount; i++) + { + switch (BE_BYTESWAP) + { + case BESwapNone: + dst[i] = ColorspaceConvert555To8888Opaque(src[i]); + break; + + case BESwapSrc: + dst[i] = ColorspaceConvert555To8888Opaque(LE_TO_LOCAL_16(src[i])); + break; + + case BESwapDst: + dst[i] = LE_TO_LOCAL_32( ColorspaceConvert555To8888Opaque(src[i]) ); + break; + + case BESwapSrcDst: + dst[i] = LE_TO_LOCAL_32( ColorspaceConvert555To8888Opaque(LE_TO_LOCAL_16(src[i])) ); + break; + } + } + + return i; +} + +template +size_t ColorspaceHandler::ConvertBuffer555To8888Opaque_IsUnaligned(const u16 *__restrict src, u32 
*__restrict dst, size_t pixCount) const +{ + return this->ColorspaceHandler::ConvertBuffer555To8888Opaque(src, dst, pixCount); +} + +template +size_t ColorspaceHandler::ConvertBuffer555To8888Opaque_SwapRB_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return this->ColorspaceHandler::ConvertBuffer555To8888Opaque_SwapRB(src, dst, pixCount); +} + +template +size_t ColorspaceHandler::ConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + size_t i = 0; + + for (; i < pixCount; i++) + { + switch (BE_BYTESWAP) + { + case BESwapNone: + dst[i] = ColorspaceConvert555To6665Opaque(src[i]); + break; + + case BESwapSrc: + dst[i] = ColorspaceConvert555To6665Opaque(LE_TO_LOCAL_16(src[i])); + break; + + case BESwapDst: + dst[i] = LE_TO_LOCAL_32( ColorspaceConvert555To6665Opaque(src[i]) ); + break; + + case BESwapSrcDst: + dst[i] = LE_TO_LOCAL_32( ColorspaceConvert555To6665Opaque(LE_TO_LOCAL_16(src[i])) ); + break; + } + } + + return i; +} + +template +size_t ColorspaceHandler::ConvertBuffer555To6665Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + size_t i = 0; + + for (; i < pixCount; i++) + { + switch (BE_BYTESWAP) + { + case BESwapNone: + dst[i] = ColorspaceConvert555To6665Opaque(src[i]); + break; + + case BESwapSrc: + dst[i] = ColorspaceConvert555To6665Opaque(LE_TO_LOCAL_16(src[i])); + break; + + case BESwapDst: + dst[i] = LE_TO_LOCAL_32( ColorspaceConvert555To6665Opaque(src[i]) ); + break; + + case BESwapSrcDst: + dst[i] = LE_TO_LOCAL_32( ColorspaceConvert555To6665Opaque(LE_TO_LOCAL_16(src[i])) ); + break; + } + } + + return i; +} + +template +size_t ColorspaceHandler::ConvertBuffer555To6665Opaque_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return this->ColorspaceHandler::ConvertBuffer555To6665Opaque(src, dst, pixCount); +} + +template +size_t 
ColorspaceHandler::ConvertBuffer555To6665Opaque_SwapRB_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return this->ColorspaceHandler::ConvertBuffer555To6665Opaque_SwapRB(src, dst, pixCount); +} + +size_t ColorspaceHandler::ConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount) const +{ + size_t i = 0; + + for (; i < pixCount; i++) + { + dst[i] = ColorspaceConvert8888To6665(src[i]); + } + + return i; +} + +size_t ColorspaceHandler::ConvertBuffer8888To6665_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const +{ + size_t i = 0; + + for (; i < pixCount; i++) + { + dst[i] = ColorspaceConvert8888To6665(src[i]); + } + + return i; +} + +size_t ColorspaceHandler::ConvertBuffer8888To6665_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return this->ColorspaceHandler::ConvertBuffer8888To6665(src, dst, pixCount); +} + +size_t ColorspaceHandler::ConvertBuffer8888To6665_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return this->ColorspaceHandler::ConvertBuffer8888To6665_SwapRB(src, dst, pixCount); +} + +size_t ColorspaceHandler::ConvertBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount) const +{ + size_t i = 0; + + for (; i < pixCount; i++) + { + dst[i] = ColorspaceConvert6665To8888(src[i]); + } + + return i; +} + +size_t ColorspaceHandler::ConvertBuffer6665To8888_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const +{ + size_t i = 0; + + for (; i < pixCount; i++) + { + dst[i] = ColorspaceConvert6665To8888(src[i]); + } + + return i; +} + +size_t ColorspaceHandler::ConvertBuffer6665To8888_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return this->ColorspaceHandler::ConvertBuffer6665To8888(src, dst, pixCount); +} + +size_t ColorspaceHandler::ConvertBuffer6665To8888_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return this->ColorspaceHandler::ConvertBuffer6665To8888_SwapRB(src, dst, pixCount); +} + +size_t 
ColorspaceHandler::ConvertBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + size_t i = 0; + + for (; i < pixCount; i++) + { + dst[i] = ColorspaceConvert8888To5551(src[i]); + } + + return i; +} + +size_t ColorspaceHandler::ConvertBuffer8888To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + size_t i = 0; + + for (; i < pixCount; i++) + { + dst[i] = ColorspaceConvert8888To5551(src[i]); + } + + return i; +} + +size_t ColorspaceHandler::ConvertBuffer8888To5551_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return this->ColorspaceHandler::ConvertBuffer8888To5551(src, dst, pixCount); +} + +size_t ColorspaceHandler::ConvertBuffer8888To5551_SwapRB_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return this->ColorspaceHandler::ConvertBuffer8888To5551_SwapRB(src, dst, pixCount); +} + +size_t ColorspaceHandler::ConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + size_t i = 0; + + for (; i < pixCount; i++) + { + dst[i] = ColorspaceConvert6665To5551(src[i]); + } + + return i; +} + +size_t ColorspaceHandler::ConvertBuffer6665To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + size_t i = 0; + + for (; i < pixCount; i++) + { + dst[i] = ColorspaceConvert6665To5551(src[i]); + } + + return i; +} + +size_t ColorspaceHandler::ConvertBuffer6665To5551_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return this->ColorspaceHandler::ConvertBuffer6665To5551(src, dst, pixCount); +} + +size_t ColorspaceHandler::ConvertBuffer6665To5551_SwapRB_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return this->ColorspaceHandler::ConvertBuffer6665To5551_SwapRB(src, dst, pixCount); +} + +size_t ColorspaceHandler::ConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t 
pixCount) const +{ + size_t i = 0; + + for (; i < pixCount; i++) + { + dst[i] = ColorspaceConvert888XTo8888Opaque(src[i]); + } + + return i; +} + +size_t ColorspaceHandler::ConvertBuffer888XTo8888Opaque_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const +{ + size_t i = 0; + + for (; i < pixCount; i++) + { + dst[i] = ColorspaceConvert888XTo8888Opaque(src[i]); + } + + return i; +} + +size_t ColorspaceHandler::ConvertBuffer888XTo8888Opaque_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return this->ConvertBuffer888XTo8888Opaque(src, dst, pixCount); +} + +size_t ColorspaceHandler::ConvertBuffer888XTo8888Opaque_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return this->ConvertBuffer888XTo8888Opaque_SwapRB(src, dst, pixCount); +} + +size_t ColorspaceHandler::ConvertBuffer555XTo888(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + size_t i = 0; + + for (; i < pixCount; i++) + { + ColorspaceConvert555XTo888(src[i], &dst[i*3]); + } + + return i; +} + +size_t ColorspaceHandler::ConvertBuffer555XTo888_SwapRB(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + size_t i = 0; + + for (; i < pixCount; i++) + { + ColorspaceConvert555XTo888(src[i], &dst[i*3]); + } + + return i; +} + +size_t ColorspaceHandler::ConvertBuffer555XTo888_IsUnaligned(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return this->ConvertBuffer555XTo888(src, dst, pixCount); +} + +size_t ColorspaceHandler::ConvertBuffer555XTo888_SwapRB_IsUnaligned(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return this->ConvertBuffer555XTo888_SwapRB(src, dst, pixCount); +} + +size_t ColorspaceHandler::ConvertBuffer888XTo888(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + size_t i = 0; + + for (; i < pixCount; i++) + { + ColorspaceConvert888XTo888(src[i], &dst[i*3]); + } + + return i; +} + +size_t 
ColorspaceHandler::ConvertBuffer888XTo888_SwapRB(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + size_t i = 0; + + for (; i < pixCount; i++) + { + ColorspaceConvert888XTo888(src[i], &dst[i*3]); + } + + return i; +} + +size_t ColorspaceHandler::ConvertBuffer888XTo888_IsUnaligned(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return this->ConvertBuffer888XTo888(src, dst, pixCount); +} + +size_t ColorspaceHandler::ConvertBuffer888XTo888_SwapRB_IsUnaligned(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return this->ConvertBuffer888XTo888_SwapRB(src, dst, pixCount); +} + +size_t ColorspaceHandler::CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const +{ + size_t i = 0; + + for (; i < pixCount; i++) + { + dst[i] = ColorspaceCopy16(src[i]); + } + + return i; +} + +size_t ColorspaceHandler::CopyBuffer16_SwapRB_IsUnaligned(const u16 *src, u16 *dst, size_t pixCount) const +{ + return this->CopyBuffer16_SwapRB(src, dst, pixCount); +} + +size_t ColorspaceHandler::CopyBuffer32_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const +{ + size_t i = 0; + + for (; i < pixCount; i++) + { + dst[i] = ColorspaceCopy32(src[i]); + } + + return i; +} + +size_t ColorspaceHandler::CopyBuffer32_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return this->CopyBuffer32_SwapRB(src, dst, pixCount); +} + +size_t ColorspaceHandler::ApplyIntensityToBuffer16(u16 *dst, size_t pixCount, float intensity) const +{ + size_t i = 0; + + if (intensity > 0.999f) + { + return pixCount; + } + else if (intensity < 0.001f) + { + for (; i < pixCount; i++) + { + dst[i] = dst[i] & 0x8000; + } + + return i; + } + + const u16 intensity_u16 = (u16)(intensity * (float)(0xFFFF)); + + for (; i < pixCount; i++) + { + u16 outColor = dst[i]; + + u8 r = (u8)( (((outColor >> 0) & 0x1F) * intensity_u16) >> 16 ); + u8 g = (u8)( (((outColor >> 5) & 0x1F) * intensity_u16) >> 16 ); + u8 b = (u8)( (((outColor 
>> 10) & 0x1F) * intensity_u16) >> 16 ); + u16 a = outColor & 0x8000; + + dst[i] = ( (r << 0) | (g << 5) | (b << 10) | a ); + } + + return i; +} + +size_t ColorspaceHandler::ApplyIntensityToBuffer16_SwapRB(u16 *dst, size_t pixCount, float intensity) const +{ + size_t i = 0; + + if (intensity > 0.999f) + { + for (; i < pixCount; i++) + { + dst[i] = COLOR5551_SWAP_RB(dst[i]); + } + + return i; + } + else if (intensity < 0.001f) + { + for (; i < pixCount; i++) + { + dst[i] = dst[i] & 0x8000; + } + + return i; + } + + const u16 intensity_u16 = (u16)(intensity * (float)(0xFFFF)); + + for (; i < pixCount; i++) + { + u16 outColor = COLOR5551_SWAP_RB(dst[i]); + + u8 r = (u8)( (((outColor >> 0) & 0x1F) * intensity_u16) >> 16 ); + u8 g = (u8)( (((outColor >> 5) & 0x1F) * intensity_u16) >> 16 ); + u8 b = (u8)( (((outColor >> 10) & 0x1F) * intensity_u16) >> 16 ); + u16 a = outColor & 0x8000; + + dst[i] = ( (r << 0) | (g << 5) | (b << 10) | a ); + } + + return i; +} + +size_t ColorspaceHandler::ApplyIntensityToBuffer16_IsUnaligned(u16 *dst, size_t pixCount, float intensity) const +{ + return this->ApplyIntensityToBuffer16(dst, pixCount, intensity); +} + +size_t ColorspaceHandler::ApplyIntensityToBuffer16_SwapRB_IsUnaligned(u16 *dst, size_t pixCount, float intensity) const +{ + return this->ApplyIntensityToBuffer16_SwapRB(dst, pixCount, intensity); +} + +size_t ColorspaceHandler::ApplyIntensityToBuffer32(u32 *dst, size_t pixCount, float intensity) const +{ + size_t i = 0; + + if (intensity > 0.999f) + { + return pixCount; + } + else if (intensity < 0.001f) + { + for (; i < pixCount; i++) + { + dst[i] = dst[i] & 0xFF000000; + } + + return i; + } + + const u16 intensity_u16 = (u16)(intensity * (float)(0xFFFF)); + + for (; i < pixCount; i++) + { + FragmentColor &outColor = (FragmentColor &)dst[i]; + outColor.r = (u8)( ((u16)outColor.r * intensity_u16) >> 16 ); + outColor.g = (u8)( ((u16)outColor.g * intensity_u16) >> 16 ); + outColor.b = (u8)( ((u16)outColor.b * intensity_u16) >> 
16 ); + } + + return i; +} + +size_t ColorspaceHandler::ApplyIntensityToBuffer32_SwapRB(u32 *dst, size_t pixCount, float intensity) const +{ + size_t i = 0; + + if (intensity > 0.999f) + { + for (; i < pixCount; i++) + { + FragmentColor dstColor; + dstColor.color = dst[i]; + + FragmentColor &outColor = (FragmentColor &)dst[i]; + outColor.r = dstColor.b; + outColor.b = dstColor.r; + } + + return i; + } + else if (intensity < 0.001f) + { + for (; i < pixCount; i++) + { + dst[i] = dst[i] & 0xFF000000; + } + + return i; + } + + const u16 intensity_u16 = (u16)(intensity * (float)(0xFFFF)); + + for (; i < pixCount; i++) + { + FragmentColor dstColor; + dstColor.color = dst[i]; + + FragmentColor &outColor = (FragmentColor &)dst[i]; + outColor.r = (u8)( ((u16)dstColor.b * intensity_u16) >> 16 ); + outColor.g = (u8)( ((u16)dstColor.g * intensity_u16) >> 16 ); + outColor.b = (u8)( ((u16)dstColor.r * intensity_u16) >> 16 ); + } + + return i; +} + +size_t ColorspaceHandler::ApplyIntensityToBuffer32_IsUnaligned(u32 *dst, size_t pixCount, float intensity) const +{ + return this->ApplyIntensityToBuffer32(dst, pixCount, intensity); +} + +size_t ColorspaceHandler::ApplyIntensityToBuffer32_SwapRB_IsUnaligned(u32 *dst, size_t pixCount, float intensity) const +{ + return this->ApplyIntensityToBuffer32_SwapRB(dst, pixCount, intensity); +} + +template void ColorspaceConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); +template void 
ColorspaceConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); + +template void ColorspaceConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 
*__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount); + +template void ColorspaceConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount); +template void ColorspaceConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount); +template void ColorspaceConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount); +template void ColorspaceConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount); + +template void ColorspaceConvertBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount); +template void ColorspaceConvertBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount); +template void ColorspaceConvertBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount); +template void ColorspaceConvertBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount); + +template void 
ColorspaceConvertBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); + +template void ColorspaceConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount); + +template void ColorspaceConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pixCount); +template void ColorspaceConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pixCount); +template void ColorspaceConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pixCount); +template void ColorspaceConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pixCount); + +template void ColorspaceConvertBuffer555XTo888(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer555XTo888(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer555XTo888(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer555XTo888(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount); + +template void ColorspaceConvertBuffer888XTo888(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer888XTo888(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount); +template 
void ColorspaceConvertBuffer888XTo888(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount); +template void ColorspaceConvertBuffer888XTo888(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount); + +template void ColorspaceCopyBuffer16(const u16 *src, u16 *dst, size_t pixCount); +template void ColorspaceCopyBuffer16(const u16 *src, u16 *dst, size_t pixCount); +template void ColorspaceCopyBuffer16(const u16 *src, u16 *dst, size_t pixCount); +template void ColorspaceCopyBuffer16(const u16 *src, u16 *dst, size_t pixCount); + +template void ColorspaceCopyBuffer32(const u32 *src, u32 *dst, size_t pixCount); +template void ColorspaceCopyBuffer32(const u32 *src, u32 *dst, size_t pixCount); +template void ColorspaceCopyBuffer32(const u32 *src, u32 *dst, size_t pixCount); +template void ColorspaceCopyBuffer32(const u32 *src, u32 *dst, size_t pixCount); + +template void ColorspaceApplyIntensityToBuffer16(u16 *dst, size_t pixCount, float intensity); +template void ColorspaceApplyIntensityToBuffer16(u16 *dst, size_t pixCount, float intensity); +template void ColorspaceApplyIntensityToBuffer16(u16 *dst, size_t pixCount, float intensity); +template void ColorspaceApplyIntensityToBuffer16(u16 *dst, size_t pixCount, float intensity); + +template void ColorspaceApplyIntensityToBuffer32(u32 *dst, size_t pixCount, float intensity); +template void ColorspaceApplyIntensityToBuffer32(u32 *dst, size_t pixCount, float intensity); +template void ColorspaceApplyIntensityToBuffer32(u32 *dst, size_t pixCount, float intensity); +template void ColorspaceApplyIntensityToBuffer32(u32 *dst, size_t pixCount, float intensity); diff --git a/FSCHMasterEditor/nds/colorspacehandler/colorspacehandler.h b/FSCHMasterEditor/nds/colorspacehandler/colorspacehandler.h new file mode 100644 index 0000000..fe95fe8 --- /dev/null +++ b/FSCHMasterEditor/nds/colorspacehandler/colorspacehandler.h @@ -0,0 +1,434 @@ +/* + Copyright (C) 2016-2022 DeSmuME team + + This file is free software: you can 
redistribute it and/or modify
+	it under the terms of the GNU General Public License as published by
+	the Free Software Foundation, either version 2 of the License, or
+	(at your option) any later version.
+
+	This file is distributed in the hope that it will be useful,
+	but WITHOUT ANY WARRANTY; without even the implied warranty of
+	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+	GNU General Public License for more details.
+
+	You should have received a copy of the GNU General Public License
+	along with this software.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef COLORSPACEHANDLER_H
+#define COLORSPACEHANDLER_H
+
+// NOTE(review): the two bare #include directives below have lost their header
+// names in this copy of the patch; confirm the targets before applying.
+#include "types.h"
+#include
+#include
+
+
+// Describes a pixel layout as a single packed 32-bit descriptor (see the bit
+// diagram inside the enum). Only the three output-framebuffer formats are
+// currently enabled; the internal-processing formats are commented out.
+enum NDSColorFormat
+{
+	// The color format information is packed in a 32-bit value.
+	// The bits are as follows:
+	// FFFOOOOO AAAAAABB BBBBGGGG GGRRRRRR
+	//
+	// F = Flags (see below)
+	// O = Color order (see below)
+	// A = Bit count for alpha [0-63]
+	// B = Bit count for blue  [0-63]
+	// G = Bit count for green [0-63]
+	// R = Bit count for red   [0-63]
+	//
+	// Flags:
+	//   Bit 29: Reverse order flag.
+	//      Set = Bits are in reverse order, usually for little-endian usage.
+	//      Cleared = Bits are in normal order, usually for big-endian usage.
+	//
+	// Color order bits, 24-28:
+	//      0x00 = RGBA, common format
+	//      0x01 = RGAB
+	//      0x02 = RBGA
+	//      0x03 = RBAG
+	//      0x04 = RAGB
+	//      0x05 = RABG
+	//      0x06 = GRBA
+	//      0x07 = GRAB
+	//      0x08 = GBRA
+	//      0x09 = GBAR
+	//      0x0A = GARB
+	//      0x0B = GABR
+	//      0x0C = BRGA
+	//      0x0D = BRAG
+	//      0x0E = BGRA, common format
+	//      0x0F = BGAR
+	//      0x10 = BARG
+	//      0x11 = BAGR
+	//      0x12 = ARGB
+	//      0x13 = ARBG
+	//      0x14 = AGRB
+	//      0x15 = AGBR
+	//      0x16 = ABRG
+	//      0x17 = ABGR
+
+	// Color formats used for internal processing.
+	//NDSColorFormat_ABGR1555_Rev = 0x20045145,
+	//NDSColorFormat_ABGR5666_Rev = 0x20186186,
+	//NDSColorFormat_ABGR8888_Rev = 0x20208208,
+
+	// Color formats used by the output framebuffers.
+	// The 555 format is packed into u16; the 666 and 888 format is packed into u32
+	// (each value below has bit 29 set, color order 0x00, and no alpha bits).
+	NDSColorFormat_BGR555_Rev = 0x20005145,
+	NDSColorFormat_BGR666_Rev = 0x20006186,
+	NDSColorFormat_BGR888_Rev = 0x20008208
+};
+
+// A 32-bit pixel that can be read either as one packed u32 (`color`) or as
+// four 8-bit channels. `r` is the first byte in memory and `a` the last.
+union FragmentColor
+{
+	u32 color;
+	struct
+	{
+		u8 r,g,b,a;
+	};
+};
+
+// Component-expansion lookup tables. They are only declared here; the
+// definitions live outside this header.
+extern CACHE_ALIGN const u32 material_5bit_to_31bit[32];
+extern CACHE_ALIGN const u8 material_5bit_to_6bit[64]; // Padded for vector lookup table routines. Only the first 32 indices are valid. Data is mirrored across 256-bit lanes.
+extern CACHE_ALIGN const u8 material_5bit_to_8bit[64]; // Padded for vector lookup table routines. Only the first 32 indices are valid. Data is mirrored across 256-bit lanes.
+extern CACHE_ALIGN const u8 material_6bit_to_8bit[64];
+extern CACHE_ALIGN const u8 material_3bit_to_5bit[64]; // Padded for vector lookup table routines. Only the first 8 indices are valid. Data is mirrored across 128-bit lanes.
+extern CACHE_ALIGN const u8 material_3bit_to_6bit[64]; // Padded for vector lookup table routines. Only the first 8 indices are valid. Data is mirrored across 128-bit lanes.
+extern CACHE_ALIGN const u8 material_3bit_to_8bit[64]; // Padded for vector lookup table routines. Only the first 8 indices are valid. Data is mirrored across 128-bit lanes.
+
+// Whole-color lookup tables, declared here and defined outside this header.
+// color_5551_swap_rb has one entry per 16-bit value; the remaining tables have
+// one entry per 15-bit color (callers must index with bit 15 cleared).
+extern CACHE_ALIGN u16 color_5551_swap_rb[65536];
+extern CACHE_ALIGN u32 color_555_to_6665_opaque[32768];
+extern CACHE_ALIGN u32 color_555_to_6665_opaque_swap_rb[32768];
+extern CACHE_ALIGN u32 color_555_to_666[32768];
+extern CACHE_ALIGN u32 color_555_to_8888_opaque[32768];
+extern CACHE_ALIGN u32 color_555_to_8888_opaque_swap_rb[32768];
+extern CACHE_ALIGN u32 color_555_to_888[32768];
+
+#define COLOR5551_SWAP_RB(col)					(color_5551_swap_rb[(col)])					// Swaps the red-blue colors of a 16-bit RGBA5551 color
+#define COLOR555TO6665_OPAQUE(col)				(color_555_to_6665_opaque[(col)])			// Convert a 15-bit color to an opaque sparsely packed 32-bit color containing an RGBA6665 color
+#define COLOR555TO6665_OPAQUE_SWAP_RB(col)		(color_555_to_6665_opaque_swap_rb[(col)])	// Convert a 15-bit color to an opaque sparsely packed 32-bit color containing an RGBA6665 color with R and B components swapped
+#define COLOR555TO666(col)						(color_555_to_666[(col)])					// Convert a 15-bit color to a fully transparent sparsely packed 32-bit color containing an RGBA6665 color
+// The alpha argument is widened to u32 before the shift: a promoted (signed) int
+// shifted left by 24 overflows for any alpha >= 0x80.
+#define COLOR555TO6665(col,alpha5)				(((u32)(alpha5)<<24) | color_555_to_666[(col)])	// Convert a 15-bit color to a sparsely packed 32-bit color containing an RGBA6665 color with user-defined alpha
+
+#define COLOR555TO8888_OPAQUE(col)				(color_555_to_8888_opaque[(col)])			// Convert a 15-bit color to an opaque 32-bit color
+#define COLOR555TO8888_OPAQUE_SWAP_RB(col)		(color_555_to_8888_opaque_swap_rb[(col)])	// Convert a 15-bit color to an opaque 32-bit color with R and B components swapped
+#define COLOR555TO888(col)						(color_555_to_888[(col)])					// Convert a 15-bit color to an opaque 24-bit color or a fully transparent 32-bit color
+#define COLOR555TO8888(col,alpha8)				(((u32)(alpha8)<<24) | color_555_to_888[(col)])	// Convert a 15-bit color to a 32-bit color with user-defined alpha
+
+//produce a 15bpp color from individual 5bit components
+#define R5G5B5TORGB15(r,g,b) ( (r) | ((g)<<5) | ((b)<<10) )
+
+//produce a 15bpp color from individual 6bit
components
+// (R6G6B6TORGB15 takes 6-bit components and keeps the upper 5 bits of each,
+// yielding a 15-bit color with bit 15 clear.)
+#define R6G6B6TORGB15(r,g,b) ( ((r)>>1) | (((g)&0x3E)<<4) | (((b)&0x3E)<<9) )
+
+void ColorspaceHandlerInit();
+
+// NOTE(review): every `template` keyword in this section has lost its parameter
+// list in this copy of the patch (SWAP_RB is used but never declared, and the
+// forwarding calls carry no explicit template argument). Restore them from the
+// original header before building.
+
+// Converts a 15-bit color to an opaque 32-bit 8888 color via lookup table.
+// Bit 15 of src is ignored. SWAP_RB selects the table with R and B exchanged.
+template
+FORCEINLINE u32 ColorspaceConvert555To8888Opaque(const u16 src)
+{
+	return (SWAP_RB) ? COLOR555TO8888_OPAQUE_SWAP_RB(src & 0x7FFF) : COLOR555TO8888_OPAQUE(src & 0x7FFF);
+}
+
+// Converts a 15-bit color to an opaque 6665 color (stored in a u32) via lookup
+// table. Bit 15 of src is ignored.
+template
+FORCEINLINE u32 ColorspaceConvert555To6665Opaque(const u16 src)
+{
+	return (SWAP_RB) ? COLOR555TO6665_OPAQUE_SWAP_RB(src & 0x7FFF) : COLOR555TO6665_OPAQUE(src & 0x7FFF);
+}
+
+// Reduces an 8888 color to 6665: R, G and B lose their low 2 bits, alpha loses
+// its low 3 bits. SWAP_RB exchanges the red and blue channels.
+template
+FORCEINLINE u32 ColorspaceConvert8888To6665(FragmentColor srcColor)
+{
+	FragmentColor outColor;
+	outColor.r = ((SWAP_RB) ? srcColor.b : srcColor.r) >> 2;
+	outColor.g = srcColor.g >> 2;
+	outColor.b = ((SWAP_RB) ? srcColor.r : srcColor.b) >> 2;
+	outColor.a = srcColor.a >> 3;
+
+	return outColor.color;
+}
+
+// Packed-u32 overload: unpacks into a FragmentColor and forwards.
+template
+FORCEINLINE u32 ColorspaceConvert8888To6665(u32 srcColor)
+{
+	FragmentColor srcColorComponent;
+	srcColorComponent.color = srcColor;
+
+	return ColorspaceConvert8888To6665(srcColorComponent);
+}
+
+// Expands a 6665 color to 8888 through the material_6bit_to_8bit (R, G, B) and
+// material_5bit_to_8bit (alpha) tables. The channel values are used directly as
+// table indices, so they must already be within the tables' valid ranges.
+template
+FORCEINLINE u32 ColorspaceConvert6665To8888(FragmentColor srcColor)
+{
+	FragmentColor outColor;
+	outColor.r = material_6bit_to_8bit[((SWAP_RB) ? srcColor.b : srcColor.r)];
+	outColor.g = material_6bit_to_8bit[srcColor.g];
+	outColor.b = material_6bit_to_8bit[((SWAP_RB) ? srcColor.r : srcColor.b)];
+	outColor.a = material_5bit_to_8bit[srcColor.a];
+
+	return outColor.color;
+}
+
+// Packed-u32 overload: unpacks into a FragmentColor and forwards.
+template
+FORCEINLINE u32 ColorspaceConvert6665To8888(u32 srcColor)
+{
+	FragmentColor srcColorComponent;
+	srcColorComponent.color = srcColor;
+
+	return ColorspaceConvert6665To8888(srcColorComponent);
+}
+
+// Reduces an 8888 color to 5551: the top 5 bits of each color channel are kept,
+// and bit 15 is set whenever the source alpha is non-zero.
+template
+FORCEINLINE u16 ColorspaceConvert8888To5551(FragmentColor srcColor)
+{
+	return R5G5B5TORGB15( ((SWAP_RB) ? srcColor.b : srcColor.r) >> 3, srcColor.g >> 3, ((SWAP_RB) ? srcColor.r : srcColor.b) >> 3) | ((srcColor.a == 0) ? 0x0000 : 0x8000 );
+}
+
+// Packed-u32 overload: unpacks into a FragmentColor and forwards.
+template
+FORCEINLINE u16 ColorspaceConvert8888To5551(u32 srcColor)
+{
+	FragmentColor srcColorComponent;
+	srcColorComponent.color = srcColor;
+
+	return ColorspaceConvert8888To5551(srcColorComponent);
+}
+
+// Reduces a 6665 color to 5551: each 6-bit color channel loses its low bit, and
+// bit 15 is set whenever the source alpha is non-zero.
+template
+FORCEINLINE u16 ColorspaceConvert6665To5551(FragmentColor srcColor)
+{
+	return R6G6B6TORGB15( ((SWAP_RB) ? srcColor.b : srcColor.r), srcColor.g, ((SWAP_RB) ? srcColor.r : srcColor.b)) | ((srcColor.a == 0) ? 0x0000 : 0x8000);
+}
+
+// Packed-u32 overload: unpacks into a FragmentColor and forwards.
+template
+FORCEINLINE u16 ColorspaceConvert6665To5551(u32 srcColor)
+{
+	FragmentColor srcColorComponent;
+	srcColorComponent.color = srcColor;
+
+	return ColorspaceConvert6665To5551(srcColorComponent);
+}
+
+// Copies R, G and B (optionally exchanging R and B) and forces alpha to 0xFF.
+template
+FORCEINLINE u32 ColorspaceConvert888XTo8888Opaque(FragmentColor srcColor)
+{
+	FragmentColor outColor;
+	outColor.r = (SWAP_RB) ? srcColor.b : srcColor.r;
+	outColor.g = srcColor.g;
+	outColor.b = (SWAP_RB) ? srcColor.r : srcColor.b;
+	outColor.a = 0xFF;
+
+	return outColor.color;
+}
+
+// Packed-u32 overload: unpacks into a FragmentColor and forwards.
+template
+FORCEINLINE u32 ColorspaceConvert888XTo8888Opaque(u32 srcColor)
+{
+	FragmentColor srcColorComponent;
+	srcColorComponent.color = srcColor;
+
+	return ColorspaceConvert888XTo8888Opaque(srcColorComponent);
+}
+
+// Writes the three color channels of srcColor to dst[0..2], dropping the fourth
+// byte. dst must have room for at least 3 bytes.
+template
+FORCEINLINE void ColorspaceConvert888XTo888(FragmentColor srcColor, u8 *dst)
+{
+	dst[0] = (SWAP_RB) ? srcColor.b : srcColor.r;
+	dst[1] = srcColor.g;
+	dst[2] = (SWAP_RB) ? srcColor.r : srcColor.b;
+}
+
+// Packed-u32 overload: unpacks into a FragmentColor and forwards.
+template
+FORCEINLINE void ColorspaceConvert888XTo888(u32 srcColor, u8 *dst)
+{
+	FragmentColor srcColorComponent;
+	srcColorComponent.color = srcColor;
+
+	ColorspaceConvert888XTo888(srcColorComponent, dst);
+}
+
+// Expands a 15-bit color to 8 bits per channel and writes the three color bytes
+// to dst[0..2].
+template
+FORCEINLINE void ColorspaceConvert555XTo888(u16 srcColor, u8 *dst)
+{
+	FragmentColor srcColorComponent;
+	srcColorComponent.color = ColorspaceConvert555To8888Opaque(srcColor);
+
+	ColorspaceConvert888XTo888(srcColorComponent, dst);
+}
+
+// Returns srcColor unchanged, or with R and B exchanged (table lookup) when
+// SWAP_RB is set.
+template
+FORCEINLINE u16 ColorspaceCopy16(u16 srcColor)
+{
+	return (SWAP_RB) ? COLOR5551_SWAP_RB(srcColor) : srcColor;
+}
+
+// Returns srcColor with R and B optionally exchanged; G and alpha are copied.
+template
+FORCEINLINE u32 ColorspaceCopy32(FragmentColor srcColor)
+{
+	FragmentColor outColor;
+	outColor.r = (SWAP_RB) ? srcColor.b : srcColor.r;
+	outColor.g = srcColor.g;
+	outColor.b = (SWAP_RB) ? srcColor.r : srcColor.b;
+	outColor.a = srcColor.a;
+
+	return outColor.color;
+}
+
+// Packed-u32 overload: unpacks into a FragmentColor and forwards.
+template
+FORCEINLINE u32 ColorspaceCopy32(u32 srcColor)
+{
+	FragmentColor srcColorComponent;
+	srcColorComponent.color = srcColor;
+
+	return ColorspaceCopy32(srcColorComponent);
+}
+
+// Scales the three 5-bit color channels of a 5551 color by `intensity` and
+// preserves the alpha bit (bit 15).
+//   intensity > 0.999f : the (optionally R/B-swapped) color is returned as-is.
+//   intensity < 0.001f : only the alpha bit is returned (color becomes black).
+//   otherwise          : each channel is multiplied by a 0.16 fixed-point
+//                        factor and truncated.
+template
+FORCEINLINE u16 ColorspaceApplyIntensity16(u16 srcColor, float intensity)
+{
+	u16 outColor = (SWAP_RB) ? COLOR5551_SWAP_RB(srcColor) : srcColor;
+
+	if (intensity > 0.999f)
+	{
+		return outColor;
+	}
+	else if (intensity < 0.001f)
+	{
+		return (outColor & 0x8000);
+	}
+
+	const u16 intensity_u16 = (u16)(intensity * (float)(0xFFFF));
+	u8 r = (u8)( (((outColor >>  0) & 0x1F) * intensity_u16) >> 16 );
+	u8 g = (u8)( (((outColor >>  5) & 0x1F) * intensity_u16) >> 16 );
+	u8 b = (u8)( (((outColor >> 10) & 0x1F) * intensity_u16) >> 16 );
+	// The alpha flag lives in bit 15, so it must be held in a 16-bit variable;
+	// an 8-bit one would always truncate it to zero.
+	const u16 a = outColor & 0x8000;
+
+	return (u16)( (r << 0) | (g << 5) | (b << 10) | a );
+}
+
+// Scales the R, G and B channels of a 32-bit color by `intensity`; alpha is
+// passed through untouched. Uses the same thresholds and 0.16 fixed-point
+// multiply as ColorspaceApplyIntensity16.
+template
+FORCEINLINE u32 ColorspaceApplyIntensity32(FragmentColor srcColor, float intensity)
+{
+	FragmentColor outColor;
+	outColor.r = ((SWAP_RB) ? srcColor.b : srcColor.r);
+	outColor.g = srcColor.g;
+	outColor.b = ((SWAP_RB) ? srcColor.r : srcColor.b);
+	outColor.a = srcColor.a;
+
+	if (intensity > 0.999f)
+	{
+		return outColor.color;
+	}
+	else if (intensity < 0.001f)
+	{
+		// Keep only the alpha byte.
+		return (outColor.color & 0xFF000000);
+	}
+
+	const u16 intensity_u16 = (u16)(intensity * (float)(0xFFFF));
+	outColor.r = (u8)( ((u16)outColor.r * intensity_u16) >> 16 );
+	outColor.g = (u8)( ((u16)outColor.g * intensity_u16) >> 16 );
+	outColor.b = (u8)( ((u16)outColor.b * intensity_u16) >> 16 );
+
+	return outColor.color;
+}
+
+// Packed-u32 overload: unpacks into a FragmentColor and forwards, passing the
+// intensity along (the FragmentColor overload has no default for it).
+template
+FORCEINLINE u32 ColorspaceApplyIntensity32(u32 srcColor, float intensity)
+{
+	FragmentColor srcColorComponent;
+	srcColorComponent.color = srcColor;
+
+	return ColorspaceApplyIntensity32(srcColorComponent, intensity);
+}
+
+// Whole-buffer variants of the conversions above. Only declared here; each one
+// processes pixCount pixels from src into dst (or in place for the intensity
+// functions).
+template void ColorspaceConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount);
+template void ColorspaceConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount);
+template void ColorspaceConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount);
+template void ColorspaceConvertBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount);
+template void ColorspaceConvertBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount);
+template void ColorspaceConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount);
+template void ColorspaceConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pixCount);
+
+template void ColorspaceConvertBuffer555XTo888(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount);
+template void ColorspaceConvertBuffer888XTo888(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount);
+
+template void ColorspaceCopyBuffer16(const u16 *src, u16 *dst, size_t pixCount);
+template void ColorspaceCopyBuffer32(const u32 *src, u32 *dst, size_t pixCount);
+
+template void ColorspaceApplyIntensityToBuffer16(u16 *dst, size_t pixCount, float intensity);
+template void
ColorspaceApplyIntensityToBuffer32(u32 *dst, size_t pixCount, float intensity); + +class ColorspaceHandler +{ +public: + ColorspaceHandler() {}; + + template size_t ConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + template size_t ConvertBuffer555To8888Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + template size_t ConvertBuffer555To8888Opaque_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + template size_t ConvertBuffer555To8888Opaque_SwapRB_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + + template size_t ConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + template size_t ConvertBuffer555To6665Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + template size_t ConvertBuffer555To6665Opaque_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + template size_t ConvertBuffer555To6665Opaque_SwapRB_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + + size_t ConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer8888To6665_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer8888To6665_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer8888To6665_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + + size_t ConvertBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer6665To8888_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer6665To8888_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer6665To8888_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + + size_t ConvertBuffer8888To5551(const u32 *__restrict src, u16 *__restrict 
dst, size_t pixCount) const; + size_t ConvertBuffer8888To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer8888To5551_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer8888To5551_SwapRB_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + + size_t ConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer6665To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer6665To5551_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer6665To5551_SwapRB_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + + size_t ConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer888XTo8888Opaque_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer888XTo8888Opaque_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer888XTo8888Opaque_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + + size_t ConvertBuffer555XTo888(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555XTo888_SwapRB(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555XTo888_IsUnaligned(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555XTo888_SwapRB_IsUnaligned(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + + size_t ConvertBuffer888XTo888(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer888XTo888_SwapRB(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer888XTo888_IsUnaligned(const u32 *__restrict src, u8 *__restrict dst, 
size_t pixCount) const; + size_t ConvertBuffer888XTo888_SwapRB_IsUnaligned(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + + size_t CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const; + size_t CopyBuffer16_SwapRB_IsUnaligned(const u16 *src, u16 *dst, size_t pixCount) const; + + size_t CopyBuffer32_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; + size_t CopyBuffer32_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + + size_t ApplyIntensityToBuffer16(u16 *dst, size_t pixCount, float intensity) const; + size_t ApplyIntensityToBuffer16_SwapRB(u16 *dst, size_t pixCount, float intensity) const; + size_t ApplyIntensityToBuffer16_IsUnaligned(u16 *dst, size_t pixCount, float intensity) const; + size_t ApplyIntensityToBuffer16_SwapRB_IsUnaligned(u16 *dst, size_t pixCount, float intensity) const; + + size_t ApplyIntensityToBuffer32(u32 *dst, size_t pixCount, float intensity) const; + size_t ApplyIntensityToBuffer32_SwapRB(u32 *dst, size_t pixCount, float intensity) const; + size_t ApplyIntensityToBuffer32_IsUnaligned(u32 *dst, size_t pixCount, float intensity) const; + size_t ApplyIntensityToBuffer32_SwapRB_IsUnaligned(u32 *dst, size_t pixCount, float intensity) const; +}; + +FORCEINLINE FragmentColor MakeFragmentColor(const u8 r, const u8 g, const u8 b, const u8 a) +{ + FragmentColor ret; + ret.r = r; ret.g = g; ret.b = b; ret.a = a; + return ret; +} + +#endif /* COLORSPACEHANDLER_H */ diff --git a/FSCHMasterEditor/nds/colorspacehandler/colorspacehandler_AVX2.cpp b/FSCHMasterEditor/nds/colorspacehandler/colorspacehandler_AVX2.cpp new file mode 100644 index 0000000..e35747f --- /dev/null +++ b/FSCHMasterEditor/nds/colorspacehandler/colorspacehandler_AVX2.cpp @@ -0,0 +1,1200 @@ +/* + Copyright (C) 2016-2021 DeSmuME team + + This file is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 
2 of the License, or + (at your option) any later version. + + This file is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with the this software. If not, see . + */ + +#include "colorspacehandler_AVX2.h" + +#ifndef ENABLE_AVX2 + #error This code requires AVX2 support. +#else + +#include +#include + +template +FORCEINLINE void ColorspaceConvert555To8888_AVX2(const v256u16 &srcColor, const v256u16 &srcAlphaBits, v256u32 &dstLo, v256u32 &dstHi) +{ + // Conversion algorithm: + // RGB 5-bit to 8-bit formula: dstRGB8 = (srcRGB5 << 3) | ((srcRGB5 >> 2) & 0x07) + + if (SWAP_RB) + { + v256u16 rb = _mm256_or_si256( _mm256_slli_epi16(srcColor,11), _mm256_and_si256(_mm256_srli_epi16(srcColor, 7), _mm256_set1_epi16(0x00F8)) ); + rb = _mm256_or_si256(rb, _mm256_and_si256(_mm256_srli_epi16(rb, 5), _mm256_set1_epi16(0x0707))); + + v256u16 ga = _mm256_and_si256(_mm256_srli_epi16(srcColor, 2), _mm256_set1_epi16(0x00F8) ); + ga = _mm256_or_si256(ga, _mm256_srli_epi16(ga, 5)); + ga = _mm256_or_si256(ga, srcAlphaBits); + + rb = _mm256_permute4x64_epi64(rb, 0xD8); + ga = _mm256_permute4x64_epi64(ga, 0xD8); + + dstLo = _mm256_unpacklo_epi8(rb, ga); + dstHi = _mm256_unpackhi_epi8(rb, ga); + } + else + { + const v256u16 r = _mm256_and_si256( _mm256_slli_epi16(srcColor, 3), _mm256_set1_epi16(0x00F8) ); + v256u16 rg = _mm256_or_si256( r, _mm256_and_si256(_mm256_slli_epi16(srcColor, 6), _mm256_set1_epi16(0xF800)) ); + rg = _mm256_or_si256( rg, _mm256_and_si256(_mm256_srli_epi16(rg, 5), _mm256_set1_epi16(0x0707)) ); + + v256u16 ba = _mm256_and_si256( _mm256_srli_epi16(srcColor, 7), _mm256_set1_epi16(0x00F8) ); + ba = _mm256_or_si256(ba, _mm256_srli_epi16(ba, 5)); + ba = _mm256_or_si256(ba, srcAlphaBits); + + rg = _mm256_permute4x64_epi64(rg, 
0xD8); + ba = _mm256_permute4x64_epi64(ba, 0xD8); + + dstLo = _mm256_unpacklo_epi16(rg, ba); + dstHi = _mm256_unpackhi_epi16(rg, ba); + } +} + +template +FORCEINLINE void ColorspaceConvert555XTo888X_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi) +{ + // Conversion algorithm: + // RGB 5-bit to 8-bit formula: dstRGB8 = (srcRGB5 << 3) | ((srcRGB5 >> 2) & 0x07) + + if (SWAP_RB) + { + v256u16 rb = _mm256_or_si256( _mm256_slli_epi16(srcColor,11), _mm256_and_si256(_mm256_srli_epi16(srcColor, 7), _mm256_set1_epi16(0x00F8)) ); + rb = _mm256_or_si256(rb, _mm256_and_si256(_mm256_srli_epi16(rb, 5), _mm256_set1_epi16(0x0707))); + + v256u16 g = _mm256_and_si256(_mm256_srli_epi16(srcColor, 2), _mm256_set1_epi16(0x00F8) ); + g = _mm256_or_si256(g, _mm256_srli_epi16(g, 5)); + + rb = _mm256_permute4x64_epi64(rb, 0xD8); + g = _mm256_permute4x64_epi64( g, 0xD8); + + dstLo = _mm256_unpacklo_epi8(rb, g); + dstHi = _mm256_unpackhi_epi8(rb, g); + } + else + { + const v256u16 r = _mm256_and_si256( _mm256_slli_epi16(srcColor, 3), _mm256_set1_epi16(0x00F8) ); + v256u16 rg = _mm256_or_si256( r, _mm256_and_si256(_mm256_slli_epi16(srcColor, 6), _mm256_set1_epi16(0xF800)) ); + rg = _mm256_or_si256( rg, _mm256_and_si256(_mm256_srli_epi32(rg, 5), _mm256_set1_epi16(0x0707)) ); + + v256u16 b = _mm256_and_si256( _mm256_srli_epi16(srcColor, 7), _mm256_set1_epi16(0x00F8) ); + b = _mm256_or_si256(b, _mm256_srli_epi32(b, 5)); + + rg = _mm256_permute4x64_epi64(rg, 0xD8); + b = _mm256_permute4x64_epi64( b, 0xD8); + + dstLo = _mm256_unpacklo_epi16(rg, b); + dstHi = _mm256_unpackhi_epi16(rg, b); + } +} + +template +FORCEINLINE void ColorspaceConvert555To6665_AVX2(const v256u16 &srcColor, const v256u16 &srcAlphaBits, v256u32 &dstLo, v256u32 &dstHi) +{ + // Conversion algorithm: + // RGB 5-bit to 6-bit formula: dstRGB6 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01) + + if (SWAP_RB) + { + v256u16 rb = _mm256_and_si256( _mm256_or_si256( _mm256_slli_epi16(srcColor,9), _mm256_srli_epi16(srcColor, 9)), 
_mm256_set1_epi16(0x3E3E) ); + rb = _mm256_or_si256(rb, _mm256_and_si256(_mm256_srli_epi16(rb, 5), _mm256_set1_epi16(0x0101))); + + v256u16 ga = _mm256_and_si256(_mm256_srli_epi16(srcColor, 4), _mm256_set1_epi16(0x003E) ); + ga = _mm256_or_si256(ga, _mm256_srli_epi16(ga, 5)); + ga = _mm256_or_si256(ga, srcAlphaBits); + + rb = _mm256_permute4x64_epi64(rb, 0xD8); + ga = _mm256_permute4x64_epi64(ga, 0xD8); + + dstLo = _mm256_unpacklo_epi8(rb, ga); + dstHi = _mm256_unpackhi_epi8(rb, ga); + } + else + { + const v256u16 r = _mm256_and_si256( _mm256_slli_epi16(srcColor, 1), _mm256_set1_epi16(0x003E) ); + const v256u16 b = _mm256_and_si256( _mm256_srli_epi16(srcColor, 9), _mm256_set1_epi16(0x003E) ); + + v256u16 rg = _mm256_or_si256( r, _mm256_and_si256(_mm256_slli_epi16(srcColor, 4), _mm256_set1_epi16(0x3E00)) ); + rg = _mm256_or_si256( rg, _mm256_and_si256(_mm256_srli_epi16(rg, 5), _mm256_set1_epi16(0x0101)) ); + + v256u16 ba = _mm256_or_si256(b, _mm256_srli_epi16(b, 5)); + ba = _mm256_or_si256(ba, srcAlphaBits); + + rg = _mm256_permute4x64_epi64(rg, 0xD8); + ba = _mm256_permute4x64_epi64(ba, 0xD8); + + dstLo = _mm256_unpacklo_epi16(rg, ba); + dstHi = _mm256_unpackhi_epi16(rg, ba); + } +} + +template +FORCEINLINE void ColorspaceConvert555XTo666X_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi) +{ + // Conversion algorithm: + // RGB 5-bit to 6-bit formula: dstRGB6 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01) + + if (SWAP_RB) + { + v256u16 rb = _mm256_and_si256( _mm256_or_si256( _mm256_slli_epi16(srcColor,9), _mm256_srli_epi16(srcColor, 9)), _mm256_set1_epi16(0x3E3E) ); + rb = _mm256_or_si256(rb, _mm256_and_si256(_mm256_srli_epi16(rb, 5), _mm256_set1_epi16(0x0101))); + + v256u16 g = _mm256_and_si256(_mm256_srli_epi16(srcColor, 4), _mm256_set1_epi16(0x003E) ); + g = _mm256_or_si256(g, _mm256_srli_epi16(g, 5)); + + rb = _mm256_permute4x64_epi64(rb, 0xD8); + g = _mm256_permute4x64_epi64( g, 0xD8); + + dstLo = _mm256_unpacklo_epi8(rb, g); + dstHi = 
_mm256_unpackhi_epi8(rb, g); + } + else + { + const v256u16 r = _mm256_and_si256( _mm256_slli_epi16(srcColor, 1), _mm256_set1_epi16(0x003E) ); + v256u16 rg = _mm256_or_si256( r, _mm256_and_si256(_mm256_slli_epi16(srcColor, 4), _mm256_set1_epi16(0x3E00)) ); + rg = _mm256_or_si256( rg, _mm256_and_si256(_mm256_srli_epi16(rg, 5), _mm256_set1_epi16(0x0101)) ); + + v256u16 b = _mm256_and_si256( _mm256_srli_epi16(srcColor, 9), _mm256_set1_epi16(0x003E) ); + b = _mm256_or_si256(b, _mm256_srli_epi16(b, 5)); + + rg = _mm256_permute4x64_epi64(rg, 0xD8); + b = _mm256_permute4x64_epi64( b, 0xD8); + + dstLo = _mm256_unpacklo_epi16(rg, b); + dstHi = _mm256_unpackhi_epi16(rg, b); + } +} + +template +FORCEINLINE void ColorspaceConvert555To8888Opaque_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi) +{ + const v256u16 srcAlphaBits16 = _mm256_set1_epi16(0xFF00); + ColorspaceConvert555To8888_AVX2(srcColor, srcAlphaBits16, dstLo, dstHi); +} + +template +FORCEINLINE void ColorspaceConvert555To6665Opaque_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi) +{ + const v256u16 srcAlphaBits16 = _mm256_set1_epi16(0x1F00); + ColorspaceConvert555To6665_AVX2(srcColor, srcAlphaBits16, dstLo, dstHi); +} + +template +FORCEINLINE v256u32 ColorspaceConvert8888To6665_AVX2(const v256u32 &src) +{ + // Conversion algorithm: + // RGB 8-bit to 6-bit formula: dstRGB6 = (srcRGB8 >> 2) + // Alpha 8-bit to 6-bit formula: dstA5 = (srcA8 >> 3) + v256u32 rgb = _mm256_and_si256( _mm256_srli_epi32(src, 2), _mm256_set1_epi32(0x003F3F3F) ); + const v256u32 a = _mm256_and_si256( _mm256_srli_epi32(src, 3), _mm256_set1_epi32(0x1F000000) ); + + if (SWAP_RB) + { + rgb = _mm256_shuffle_epi8( rgb, _mm256_set_epi8(31,28,29,30, 27,24,25,26, 23,20,21,22, 19,16,17,18, 15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2) ); + } + + return _mm256_or_si256(rgb, a); +} + +template +FORCEINLINE v256u32 ColorspaceConvert6665To8888_AVX2(const v256u32 &src) +{ + // Conversion algorithm: + // RGB 6-bit to 8-bit formula: 
dstRGB8 = (srcRGB6 << 2) | ((srcRGB6 >> 4) & 0x03) + // Alpha 5-bit to 8-bit formula: dstA8 = (srcA5 << 3) | ((srcA5 >> 2) & 0x07) + v256u32 rgb = _mm256_or_si256( _mm256_and_si256(_mm256_slli_epi32(src, 2), _mm256_set1_epi32(0x00FCFCFC)), _mm256_and_si256(_mm256_srli_epi32(src, 4), _mm256_set1_epi32(0x00030303)) ); + const v256u32 a = _mm256_or_si256( _mm256_and_si256(_mm256_slli_epi32(src, 3), _mm256_set1_epi32(0xF8000000)), _mm256_and_si256(_mm256_srli_epi32(src, 2), _mm256_set1_epi32(0x07000000)) ); + + if (SWAP_RB) + { + rgb = _mm256_shuffle_epi8( rgb, _mm256_set_epi8(31,28,29,30, 27,24,25,26, 23,20,21,22, 19,16,17,18, 15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2) ); + } + + return _mm256_or_si256(rgb, a); +} + +template +FORCEINLINE v256u16 _ConvertColorBaseTo5551_AVX2(const v256u32 &srcLo, const v256u32 &srcHi) +{ + if (COLORFORMAT == NDSColorFormat_BGR555_Rev) + { + return srcLo; + } + + v256u32 rgbLo; + v256u32 rgbHi; + v256u16 alpha; + + if (COLORFORMAT == NDSColorFormat_BGR666_Rev) + { + if (SWAP_RB) + { + // Convert color from low bits + rgbLo = _mm256_and_si256(_mm256_srli_epi32(srcLo, 17), _mm256_set1_epi32(0x0000001F)); + rgbLo = _mm256_or_si256(rgbLo, _mm256_and_si256(_mm256_srli_epi32(srcLo, 4), _mm256_set1_epi32(0x000003E0)) ); + rgbLo = _mm256_or_si256(rgbLo, _mm256_and_si256(_mm256_slli_epi32(srcLo, 9), _mm256_set1_epi32(0x00007C00)) ); + + // Convert color from high bits + rgbHi = _mm256_and_si256(_mm256_srli_epi32(srcHi, 17), _mm256_set1_epi32(0x0000001F)); + rgbHi = _mm256_or_si256(rgbHi, _mm256_and_si256(_mm256_srli_epi32(srcHi, 4), _mm256_set1_epi32(0x000003E0)) ); + rgbHi = _mm256_or_si256(rgbHi, _mm256_and_si256(_mm256_slli_epi32(srcHi, 9), _mm256_set1_epi32(0x00007C00)) ); + } + else + { + // Convert color from low bits + rgbLo = _mm256_and_si256(_mm256_srli_epi32(srcLo, 1), _mm256_set1_epi32(0x0000001F)); + rgbLo = _mm256_or_si256(rgbLo, _mm256_and_si256(_mm256_srli_epi32(srcLo, 4), _mm256_set1_epi32(0x000003E0)) ); + rgbLo = 
_mm256_or_si256(rgbLo, _mm256_and_si256(_mm256_srli_epi32(srcLo, 7), _mm256_set1_epi32(0x00007C00)) ); + + // Convert color from high bits + rgbHi = _mm256_and_si256(_mm256_srli_epi32(srcHi, 1), _mm256_set1_epi32(0x0000001F)); + rgbHi = _mm256_or_si256(rgbHi, _mm256_and_si256(_mm256_srli_epi32(srcHi, 4), _mm256_set1_epi32(0x000003E0)) ); + rgbHi = _mm256_or_si256(rgbHi, _mm256_and_si256(_mm256_srli_epi32(srcHi, 7), _mm256_set1_epi32(0x00007C00)) ); + } + + // Convert alpha + alpha = _mm256_packus_epi32( _mm256_and_si256(_mm256_srli_epi32(srcLo, 24), _mm256_set1_epi32(0x0000001F)), _mm256_and_si256(_mm256_srli_epi32(srcHi, 24), _mm256_set1_epi32(0x0000001F)) ); + alpha = _mm256_permute4x64_epi64(alpha, 0xD8); + alpha = _mm256_cmpgt_epi16(alpha, _mm256_setzero_si256()); + alpha = _mm256_and_si256(alpha, _mm256_set1_epi16(0x8000)); + } + else if (COLORFORMAT == NDSColorFormat_BGR888_Rev) + { + if (SWAP_RB) + { + // Convert color from low bits + rgbLo = _mm256_and_si256(_mm256_srli_epi32(srcLo, 19), _mm256_set1_epi32(0x0000001F)); + rgbLo = _mm256_or_si256(rgbLo, _mm256_and_si256(_mm256_srli_epi32(srcLo, 6), _mm256_set1_epi32(0x000003E0)) ); + rgbLo = _mm256_or_si256(rgbLo, _mm256_and_si256(_mm256_slli_epi32(srcLo, 7), _mm256_set1_epi32(0x00007C00)) ); + + // Convert color from high bits + rgbHi = _mm256_and_si256(_mm256_srli_epi32(srcHi, 19), _mm256_set1_epi32(0x0000001F)); + rgbHi = _mm256_or_si256(rgbHi, _mm256_and_si256(_mm256_srli_epi32(srcHi, 6), _mm256_set1_epi32(0x000003E0)) ); + rgbHi = _mm256_or_si256(rgbHi, _mm256_and_si256(_mm256_slli_epi32(srcHi, 7), _mm256_set1_epi32(0x00007C00)) ); + } + else + { + // Convert color from low bits + rgbLo = _mm256_and_si256(_mm256_srli_epi32(srcLo, 3), _mm256_set1_epi32(0x0000001F)); + rgbLo = _mm256_or_si256(rgbLo, _mm256_and_si256(_mm256_srli_epi32(srcLo, 6), _mm256_set1_epi32(0x000003E0)) ); + rgbLo = _mm256_or_si256(rgbLo, _mm256_and_si256(_mm256_srli_epi32(srcLo, 9), _mm256_set1_epi32(0x00007C00)) ); + + // Convert 
color from high bits + rgbHi = _mm256_and_si256(_mm256_srli_epi32(srcHi, 3), _mm256_set1_epi32(0x0000001F)); + rgbHi = _mm256_or_si256(rgbHi, _mm256_and_si256(_mm256_srli_epi32(srcHi, 6), _mm256_set1_epi32(0x000003E0)) ); + rgbHi = _mm256_or_si256(rgbHi, _mm256_and_si256(_mm256_srli_epi32(srcHi, 9), _mm256_set1_epi32(0x00007C00)) ); + } + + // Convert alpha + alpha = _mm256_packus_epi32( _mm256_srli_epi32(srcLo, 24), _mm256_srli_epi32(srcHi, 24) ); + alpha = _mm256_permute4x64_epi64(alpha, 0xD8); + alpha = _mm256_cmpgt_epi16(alpha, _mm256_setzero_si256()); + alpha = _mm256_and_si256(alpha, _mm256_set1_epi16(0x8000)); + } + + return _mm256_or_si256( _mm256_permute4x64_epi64(_mm256_packus_epi32(rgbLo, rgbHi), 0xD8), alpha ); +} + +template +FORCEINLINE v256u16 ColorspaceConvert8888To5551_AVX2(const v256u32 &srcLo, const v256u32 &srcHi) +{ + return _ConvertColorBaseTo5551_AVX2(srcLo, srcHi); +} + +template +FORCEINLINE v256u16 ColorspaceConvert6665To5551_AVX2(const v256u32 &srcLo, const v256u32 &srcHi) +{ + return _ConvertColorBaseTo5551_AVX2(srcLo, srcHi); +} + +template +FORCEINLINE v256u32 ColorspaceConvert888XTo8888Opaque_AVX2(const v256u32 &src) +{ + if (SWAP_RB) + { + return _mm256_or_si256( _mm256_shuffle_epi8(src, _mm256_set_epi8(31,28,29,30, 27,24,25,26, 23,20,21,22, 19,16,17,18, 15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2)), _mm256_set1_epi32(0xFF000000) ); + } + + return _mm256_or_si256(src, _mm256_set1_epi32(0xFF000000)); +} + +template +FORCEINLINE v256u16 ColorspaceCopy16_AVX2(const v256u16 &src) +{ + if (SWAP_RB) + { + /* Exchange the 5-bit fields at bits 0-4 and bits 10-14 of every 16-bit lane; the middle field (bits 5-9, mask 0x03E0) and the top bit (mask 0x8000) stay in place. The middle mask was previously 0x0E30, which kept the wrong bits. */ return _mm256_or_si256( _mm256_or_si256(_mm256_srli_epi16(_mm256_and_si256(src, _mm256_set1_epi16(0x7C00)), 10), _mm256_or_si256(_mm256_and_si256(src, _mm256_set1_epi16(0x03E0)), _mm256_slli_epi16(_mm256_and_si256(src, _mm256_set1_epi16(0x001F)), 10))), _mm256_and_si256(src, _mm256_set1_epi16(0x8000)) ); + } + + return src; +} + +template +FORCEINLINE v256u32 ColorspaceCopy32_AVX2(const v256u32 &src) +{ + if (SWAP_RB) + { + return 
_mm256_shuffle_epi8(src, _mm256_set_epi8(31,28,29,30, 27,24,25,26, 23,20,21,22, 19,16,17,18, 15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2)); + } + + return src; +} + +template +FORCEINLINE v256u16 ColorspaceApplyIntensity16_AVX2(const v256u16 &src, float intensity) +{ + /* Optional pre-swap of the fields at bits 0-4 and 10-14; the middle field (bits 5-9, mask 0x03E0) and the top bit (0x8000) are kept in place. The middle mask was previously 0x0E30, which kept the wrong bits. */ v256u16 tempSrc = (SWAP_RB) ? _mm256_or_si256( _mm256_or_si256(_mm256_srli_epi16(_mm256_and_si256(src, _mm256_set1_epi16(0x7C00)), 10), _mm256_or_si256(_mm256_and_si256(src, _mm256_set1_epi16(0x03E0)), _mm256_slli_epi16(_mm256_and_si256(src, _mm256_set1_epi16(0x001F)), 10))), _mm256_and_si256(src, _mm256_set1_epi16(0x8000)) ) : src; + + /* Intensity above 0.999 returns the (possibly swapped) pixels untouched; below 0.001 only the top bit survives. */ if (intensity > 0.999f) + { + return tempSrc; + } + else if (intensity < 0.001f) + { + return _mm256_and_si256(tempSrc, _mm256_set1_epi16(0x8000)); + } + + v256u16 r = _mm256_and_si256( tempSrc, _mm256_set1_epi16(0x001F) ); + v256u16 g = _mm256_and_si256( _mm256_srli_epi16(tempSrc, 5), _mm256_set1_epi16(0x001F) ); + v256u16 b = _mm256_and_si256( _mm256_srli_epi16(tempSrc, 10), _mm256_set1_epi16(0x001F) ); + v256u16 a = _mm256_and_si256( tempSrc, _mm256_set1_epi16(0x8000) ); + + /* intensity becomes a 16-bit fixed-point factor; mulhi keeps the high 16 bits of each product, i.e. (channel * factor) >> 16. */ const v256u16 intensity_v256 = _mm256_set1_epi16( (u16)(intensity * (float)(0xFFFF)) ); + + r = _mm256_mulhi_epu16(r, intensity_v256); + g = _mm256_slli_epi16( _mm256_mulhi_epu16(g, intensity_v256), 5 ); + b = _mm256_slli_epi16( _mm256_mulhi_epu16(b, intensity_v256), 10 ); + + return _mm256_or_si256( _mm256_or_si256( _mm256_or_si256(r, g), b), a); +} + +template +FORCEINLINE v256u32 ColorspaceApplyIntensity32_AVX2(const v256u32 &src, float intensity) +{ + v256u32 tempSrc = (SWAP_RB) ? 
_mm256_shuffle_epi8(src, _mm256_set_epi8(31,28,29,30, 27,24,25,26, 23,20,21,22, 19,16,17,18, 15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2)) : src; + + if (intensity > 0.999f) + { + return tempSrc; + } + else if (intensity < 0.001f) + { + return _mm256_and_si256(tempSrc, _mm256_set1_epi32(0xFF000000)); + } + + v256u16 rb = _mm256_and_si256( tempSrc, _mm256_set1_epi32(0x00FF00FF) ); + v256u16 g = _mm256_and_si256( _mm256_srli_epi32(tempSrc, 8), _mm256_set1_epi32(0x000000FF) ); + v256u32 a = _mm256_and_si256( tempSrc, _mm256_set1_epi32(0xFF000000) ); + + const v256u16 intensity_v256 = _mm256_set1_epi16( (u16)(intensity * (float)(0xFFFF)) ); + + rb = _mm256_mulhi_epu16(rb, intensity_v256); + g = _mm256_slli_epi32( _mm256_mulhi_epu16( g, intensity_v256), 8 ); + + return _mm256_or_si256( _mm256_or_si256(rb, g), a); +} + +template +static size_t ColorspaceConvertBuffer555To8888Opaque_AVX2(const u16 *__restrict src, u32 *__restrict dst, const size_t pixCountVec256) +{ + size_t i = 0; + + for (; i < pixCountVec256; i+=(sizeof(v256u16)/sizeof(u16))) + { + v256u16 src_vec256 = (IS_UNALIGNED) ? 
_mm256_loadu_si256((v256u16 *)(src+i)) : _mm256_load_si256((v256u16 *)(src+i)); + v256u32 dstConvertedLo, dstConvertedHi; + ColorspaceConvert555To8888Opaque_AVX2(src_vec256, dstConvertedLo, dstConvertedHi); + + if (IS_UNALIGNED) + { + _mm256_storeu_si256((v256u32 *)(dst+i+(sizeof(v256u32)/sizeof(u32) * 0)), dstConvertedLo); + _mm256_storeu_si256((v256u32 *)(dst+i+(sizeof(v256u32)/sizeof(u32) * 1)), dstConvertedHi); + } + else + { + _mm256_store_si256((v256u32 *)(dst+i+(sizeof(v256u32)/sizeof(u32) * 0)), dstConvertedLo); + _mm256_store_si256((v256u32 *)(dst+i+(sizeof(v256u32)/sizeof(u32) * 1)), dstConvertedHi); + } + } + + return i; +} + +template +size_t ColorspaceConvertBuffer555To6665Opaque_AVX2(const u16 *__restrict src, u32 *__restrict dst, size_t pixCountVec256) +{ + size_t i = 0; + + for (; i < pixCountVec256; i+=(sizeof(v256u16)/sizeof(u16))) + { + v256u16 src_vec256 = (IS_UNALIGNED) ? _mm256_loadu_si256((v256u16 *)(src+i)) : _mm256_load_si256((v256u16 *)(src+i)); + v256u32 dstConvertedLo, dstConvertedHi; + ColorspaceConvert555To6665Opaque_AVX2(src_vec256, dstConvertedLo, dstConvertedHi); + + if (IS_UNALIGNED) + { + _mm256_storeu_si256((v256u32 *)(dst+i+(sizeof(v256u32)/sizeof(u32) * 0)), dstConvertedLo); + _mm256_storeu_si256((v256u32 *)(dst+i+(sizeof(v256u32)/sizeof(u32) * 1)), dstConvertedHi); + } + else + { + _mm256_store_si256((v256u32 *)(dst+i+(sizeof(v256u32)/sizeof(u32) * 0)), dstConvertedLo); + _mm256_store_si256((v256u32 *)(dst+i+(sizeof(v256u32)/sizeof(u32) * 1)), dstConvertedHi); + } + } + + return i; +} + +template +size_t ColorspaceConvertBuffer8888To6665_AVX2(const u32 *src, u32 *dst, size_t pixCountVec256) +{ + size_t i = 0; + + for (; i < pixCountVec256; i+=(sizeof(v256u32)/sizeof(u32))) + { + if (IS_UNALIGNED) + { + _mm256_storeu_si256( (v256u32 *)(dst+i), ColorspaceConvert8888To6665_AVX2(_mm256_loadu_si256((v256u32 *)(src+i))) ); + } + else + { + _mm256_store_si256( (v256u32 *)(dst+i), 
ColorspaceConvert8888To6665_AVX2(_mm256_load_si256((v256u32 *)(src+i))) ); + } + } + + return i; +} + +template +size_t ColorspaceConvertBuffer6665To8888_AVX2(const u32 *src, u32 *dst, size_t pixCountVec256) +{ + size_t i = 0; + + for (; i < pixCountVec256; i+=(sizeof(v256u32)/sizeof(u32))) + { + if (IS_UNALIGNED) + { + _mm256_storeu_si256( (v256u32 *)(dst+i), ColorspaceConvert6665To8888_AVX2(_mm256_loadu_si256((v256u32 *)(src+i))) ); + } + else + { + _mm256_store_si256( (v256u32 *)(dst+i), ColorspaceConvert6665To8888_AVX2(_mm256_load_si256((v256u32 *)(src+i))) ); + } + } + + return i; +} + +template +size_t ColorspaceConvertBuffer8888To5551_AVX2(const u32 *__restrict src, u16 *__restrict dst, size_t pixCountVec256) +{ + size_t i = 0; + + for (; i < pixCountVec256; i+=(sizeof(v256u16)/sizeof(u16))) + { + if (IS_UNALIGNED) + { + _mm256_storeu_si256( (v256u16 *)(dst+i), ColorspaceConvert8888To5551_AVX2(_mm256_loadu_si256((v256u32 *)(src+i)), _mm256_loadu_si256((v256u32 *)(src+i+(sizeof(v256u32)/sizeof(u32))))) ); + } + else + { + _mm256_store_si256( (v256u16 *)(dst+i), ColorspaceConvert8888To5551_AVX2(_mm256_load_si256((v256u32 *)(src+i)), _mm256_load_si256((v256u32 *)(src+i+(sizeof(v256u32)/sizeof(u32))))) ); + } + } + + return i; +} + +template +size_t ColorspaceConvertBuffer6665To5551_AVX2(const u32 *__restrict src, u16 *__restrict dst, size_t pixCountVec256) +{ + size_t i = 0; + + for (; i < pixCountVec256; i+=(sizeof(v256u16)/sizeof(u16))) + { + if (IS_UNALIGNED) + { + _mm256_storeu_si256( (v256u16 *)(dst+i), ColorspaceConvert6665To5551_AVX2(_mm256_loadu_si256((v256u32 *)(src+i)), _mm256_loadu_si256((v256u32 *)(src+i+(sizeof(v256u32)/sizeof(u32))))) ); + } + else + { + _mm256_store_si256( (v256u16 *)(dst+i), ColorspaceConvert6665To5551_AVX2(_mm256_load_si256((v256u32 *)(src+i)), _mm256_load_si256((v256u32 *)(src+i+(sizeof(v256u32)/sizeof(u32))))) ); + } + } + + return i; +} + +template +size_t ColorspaceConvertBuffer888XTo8888Opaque_AVX2(const u32 *src, u32 
*dst, size_t pixCountVec256) +{ + size_t i = 0; + + for (; i < pixCountVec256; i+=(sizeof(v256u32)/sizeof(u32))) + { + if (IS_UNALIGNED) + { + _mm256_storeu_si256( (v256u32 *)(dst+i), ColorspaceConvert888XTo8888Opaque_AVX2(_mm256_loadu_si256((v256u32 *)(src+i))) ); + } + else + { + _mm256_store_si256( (v256u32 *)(dst+i), ColorspaceConvert888XTo8888Opaque_AVX2(_mm256_load_si256((v256u32 *)(src+i))) ); + } + } + + return i; +} + +template +size_t ColorspaceConvertBuffer555XTo888_AVX2(const u16 *__restrict src, u8 *__restrict dst, size_t pixCountVec256) +{ + size_t i = 0; + v256u16 src_v256u16[2]; + v256u32 src_v256u32[4]; + + for (; i < pixCountVec256; i+=((sizeof(v256u16)/sizeof(u16)) * 2)) + { + if (IS_UNALIGNED) + { + src_v256u16[0] = _mm256_loadu_si256( (v256u16 *)(src + i + ((sizeof(v256u16)/sizeof(u16)) * 0)) ); + src_v256u16[1] = _mm256_loadu_si256( (v256u16 *)(src + i + ((sizeof(v256u16)/sizeof(u16)) * 1)) ); + } + else + { + src_v256u16[0] = _mm256_load_si256( (v256u16 *)(src + i + ((sizeof(v256u16)/sizeof(u16)) * 0)) ); + src_v256u16[1] = _mm256_load_si256( (v256u16 *)(src + i + ((sizeof(v256u16)/sizeof(u16)) * 1)) ); + } + + v256u16 rb = _mm256_and_si256( _mm256_or_si256(_mm256_slli_epi16(src_v256u16[0], 11), _mm256_srli_epi16(src_v256u16[0], 7)), _mm256_set1_epi16(0xF8F8) ); + v256u16 g = _mm256_and_si256( _mm256_srli_epi16(src_v256u16[0], 2), _mm256_set1_epi16(0x00F8) ); + rb = _mm256_permute4x64_epi64(rb, 0xD8); + g = _mm256_permute4x64_epi64( g, 0xD8); + src_v256u32[0] = _mm256_unpacklo_epi16(rb, g); + src_v256u32[1] = _mm256_unpackhi_epi16(rb, g); + + rb = _mm256_and_si256( _mm256_or_si256(_mm256_slli_epi16(src_v256u16[1], 11), _mm256_srli_epi16(src_v256u16[1], 7)), _mm256_set1_epi16(0xF8F8) ); + g = _mm256_and_si256( _mm256_srli_epi16(src_v256u16[1], 2), _mm256_set1_epi16(0x00F8) ); + rb = _mm256_permute4x64_epi64(rb, 0xD8); + g = _mm256_permute4x64_epi64( g, 0xD8); + src_v256u32[2] = _mm256_unpacklo_epi16(rb, g); + src_v256u32[3] = 
_mm256_unpackhi_epi16(rb, g); + + src_v256u32[0] = _mm256_or_si256( src_v256u32[0], _mm256_and_si256(_mm256_srli_epi32(src_v256u32[0], 5), _mm256_set1_epi32(0x00070707)) ); + src_v256u32[1] = _mm256_or_si256( src_v256u32[1], _mm256_and_si256(_mm256_srli_epi32(src_v256u32[1], 5), _mm256_set1_epi32(0x00070707)) ); + src_v256u32[2] = _mm256_or_si256( src_v256u32[2], _mm256_and_si256(_mm256_srli_epi32(src_v256u32[2], 5), _mm256_set1_epi32(0x00070707)) ); + src_v256u32[3] = _mm256_or_si256( src_v256u32[3], _mm256_and_si256(_mm256_srli_epi32(src_v256u32[3], 5), _mm256_set1_epi32(0x00070707)) ); + + if (SWAP_RB) + { + src_v256u32[0] = _mm256_shuffle_epi8( src_v256u32[0], _mm256_set_epi8(31,27,23,19, 29,30,28,25, 26,24,21,22, 20,17,18,16, 15,11, 7, 3, 13,14,12, 9, 10, 8, 5, 6, 4, 1, 2, 0) ); + src_v256u32[1] = _mm256_shuffle_epi8( src_v256u32[1], _mm256_set_epi8(31,27,23,19, 29,30,28,25, 26,24,21,22, 20,17,18,16, 15,11, 7, 3, 13,14,12, 9, 10, 8, 5, 6, 4, 1, 2, 0) ); + src_v256u32[2] = _mm256_shuffle_epi8( src_v256u32[2], _mm256_set_epi8(31,27,23,19, 29,30,28,25, 26,24,21,22, 20,17,18,16, 15,11, 7, 3, 13,14,12, 9, 10, 8, 5, 6, 4, 1, 2, 0) ); + src_v256u32[3] = _mm256_shuffle_epi8( src_v256u32[3], _mm256_set_epi8(31,27,23,19, 29,30,28,25, 26,24,21,22, 20,17,18,16, 15,11, 7, 3, 13,14,12, 9, 10, 8, 5, 6, 4, 1, 2, 0) ); + } + else + { + src_v256u32[0] = _mm256_shuffle_epi8( src_v256u32[0], _mm256_set_epi8(31,27,23,19, 28,30,29,24, 26,25,20,22, 21,16,18,17, 15,11, 7, 3, 12,14,13, 8, 10, 9, 4, 6, 5, 0, 2, 1) ); + src_v256u32[1] = _mm256_shuffle_epi8( src_v256u32[1], _mm256_set_epi8(31,27,23,19, 28,30,29,24, 26,25,20,22, 21,16,18,17, 15,11, 7, 3, 12,14,13, 8, 10, 9, 4, 6, 5, 0, 2, 1) ); + src_v256u32[2] = _mm256_shuffle_epi8( src_v256u32[2], _mm256_set_epi8(31,27,23,19, 28,30,29,24, 26,25,20,22, 21,16,18,17, 15,11, 7, 3, 12,14,13, 8, 10, 9, 4, 6, 5, 0, 2, 1) ); + src_v256u32[3] = _mm256_shuffle_epi8( src_v256u32[3], _mm256_set_epi8(31,27,23,19, 28,30,29,24, 26,25,20,22, 
21,16,18,17, 15,11, 7, 3, 12,14,13, 8, 10, 9, 4, 6, 5, 0, 2, 1) ); + } + + // This is necessary because vpshufb cannot shuffle bits across 128-bit lanes, but vpermd can. + src_v256u32[0] = _mm256_permutevar8x32_epi32( src_v256u32[0], _mm256_set_epi32(7, 3, 6, 5, 4, 2, 1, 0) ); + src_v256u32[1] = _mm256_permutevar8x32_epi32( src_v256u32[1], _mm256_set_epi32(1, 0, 7, 3, 6, 5, 4, 2) ); + src_v256u32[2] = _mm256_permutevar8x32_epi32( src_v256u32[2], _mm256_set_epi32(4, 2, 1, 0, 7, 3, 6, 5) ); + src_v256u32[3] = _mm256_permutevar8x32_epi32( src_v256u32[3], _mm256_set_epi32(6, 5, 4, 2, 1, 0, 7, 3) ); + + if (IS_UNALIGNED) + { + _mm256_storeu_si256( (v256u8 *)(dst + (i * 3) + (sizeof(v256u32) * 0)), _mm256_blend_epi32(src_v256u32[0], src_v256u32[1], 0xC0) ); + _mm256_storeu_si256( (v256u8 *)(dst + (i * 3) + (sizeof(v256u32) * 1)), _mm256_blend_epi32(src_v256u32[1], src_v256u32[2], 0xF0) ); + _mm256_storeu_si256( (v256u8 *)(dst + (i * 3) + (sizeof(v256u32) * 2)), _mm256_blend_epi32(src_v256u32[2], src_v256u32[3], 0xFC) ); + } + else + { + _mm256_store_si256( (v256u8 *)(dst + (i * 3) + (sizeof(v256u32) * 0)), _mm256_blend_epi32(src_v256u32[0], src_v256u32[1], 0xC0) ); + _mm256_store_si256( (v256u8 *)(dst + (i * 3) + (sizeof(v256u32) * 1)), _mm256_blend_epi32(src_v256u32[1], src_v256u32[2], 0xF0) ); + _mm256_store_si256( (v256u8 *)(dst + (i * 3) + (sizeof(v256u32) * 2)), _mm256_blend_epi32(src_v256u32[2], src_v256u32[3], 0xFC) ); + } + } + + return i; +} + +template +size_t ColorspaceConvertBuffer888XTo888_AVX2(const u32 *__restrict src, u8 *__restrict dst, size_t pixCountVec256) +{ + size_t i = 0; + v256u32 src_v256u32[4]; + + for (; i < pixCountVec256; i+=((sizeof(v256u32)/sizeof(u32)) * 4)) + { + if (IS_UNALIGNED) + { + src_v256u32[0] = _mm256_loadu_si256( (v256u32 *)(src + i + ((sizeof(v256u32)/sizeof(u32)) * 0)) ); + src_v256u32[1] = _mm256_loadu_si256( (v256u32 *)(src + i + ((sizeof(v256u32)/sizeof(u32)) * 1)) ); + src_v256u32[2] = _mm256_loadu_si256( (v256u32 *)(src + 
i + ((sizeof(v256u32)/sizeof(u32)) * 2)) ); + src_v256u32[3] = _mm256_loadu_si256( (v256u32 *)(src + i + ((sizeof(v256u32)/sizeof(u32)) * 3)) ); + } + else + { + src_v256u32[0] = _mm256_load_si256( (v256u32 *)(src + i + ((sizeof(v256u32)/sizeof(u32)) * 0)) ); + src_v256u32[1] = _mm256_load_si256( (v256u32 *)(src + i + ((sizeof(v256u32)/sizeof(u32)) * 1)) ); + src_v256u32[2] = _mm256_load_si256( (v256u32 *)(src + i + ((sizeof(v256u32)/sizeof(u32)) * 2)) ); + src_v256u32[3] = _mm256_load_si256( (v256u32 *)(src + i + ((sizeof(v256u32)/sizeof(u32)) * 3)) ); + } + + if (SWAP_RB) + { + src_v256u32[0] = _mm256_shuffle_epi8(src_v256u32[0], _mm256_set_epi8(31,27,23,19, 28,29,30,24, 25,26,20,21, 22,16,17,18, 15,11, 7, 3, 12,13,14, 8, 9,10, 4, 5, 6, 0, 1, 2)); + src_v256u32[1] = _mm256_shuffle_epi8(src_v256u32[1], _mm256_set_epi8(31,27,23,19, 28,29,30,24, 25,26,20,21, 22,16,17,18, 15,11, 7, 3, 12,13,14, 8, 9,10, 4, 5, 6, 0, 1, 2)); + src_v256u32[2] = _mm256_shuffle_epi8(src_v256u32[2], _mm256_set_epi8(31,27,23,19, 28,29,30,24, 25,26,20,21, 22,16,17,18, 15,11, 7, 3, 12,13,14, 8, 9,10, 4, 5, 6, 0, 1, 2)); + src_v256u32[3] = _mm256_shuffle_epi8(src_v256u32[3], _mm256_set_epi8(31,27,23,19, 28,29,30,24, 25,26,20,21, 22,16,17,18, 15,11, 7, 3, 12,13,14, 8, 9,10, 4, 5, 6, 0, 1, 2)); + } + else + { + src_v256u32[0] = _mm256_shuffle_epi8(src_v256u32[0], _mm256_set_epi8(31,27,23,19, 30,29,28,26, 25,24,22,21, 20,18,17,16, 15,11, 7, 3, 14,13,12,10, 9, 8, 6, 5, 4, 2, 1, 0)); + src_v256u32[1] = _mm256_shuffle_epi8(src_v256u32[1], _mm256_set_epi8(31,27,23,19, 30,29,28,26, 25,24,22,21, 20,18,17,16, 15,11, 7, 3, 14,13,12,10, 9, 8, 6, 5, 4, 2, 1, 0)); + src_v256u32[2] = _mm256_shuffle_epi8(src_v256u32[2], _mm256_set_epi8(31,27,23,19, 30,29,28,26, 25,24,22,21, 20,18,17,16, 15,11, 7, 3, 14,13,12,10, 9, 8, 6, 5, 4, 2, 1, 0)); + src_v256u32[3] = _mm256_shuffle_epi8(src_v256u32[3], _mm256_set_epi8(31,27,23,19, 30,29,28,26, 25,24,22,21, 20,18,17,16, 15,11, 7, 3, 14,13,12,10, 9, 8, 6, 5, 4, 2, 1, 0)); 
+ } + + // This is necessary because vpshufb cannot shuffle bits across 128-bit lanes, but vpermd can. + src_v256u32[0] = _mm256_permutevar8x32_epi32( src_v256u32[0], _mm256_set_epi32(7, 3, 6, 5, 4, 2, 1, 0) ); + src_v256u32[1] = _mm256_permutevar8x32_epi32( src_v256u32[1], _mm256_set_epi32(1, 0, 7, 3, 6, 5, 4, 2) ); + src_v256u32[2] = _mm256_permutevar8x32_epi32( src_v256u32[2], _mm256_set_epi32(4, 2, 1, 0, 7, 3, 6, 5) ); + src_v256u32[3] = _mm256_permutevar8x32_epi32( src_v256u32[3], _mm256_set_epi32(6, 5, 4, 2, 1, 0, 7, 3) ); + + if (IS_UNALIGNED) + { + _mm256_storeu_si256( (v256u8 *)(dst + (i * 3) + (sizeof(v256u32) * 0)), _mm256_blend_epi32(src_v256u32[0], src_v256u32[1], 0xC0) ); + _mm256_storeu_si256( (v256u8 *)(dst + (i * 3) + (sizeof(v256u32) * 1)), _mm256_blend_epi32(src_v256u32[1], src_v256u32[2], 0xF0) ); + _mm256_storeu_si256( (v256u8 *)(dst + (i * 3) + (sizeof(v256u32) * 2)), _mm256_blend_epi32(src_v256u32[2], src_v256u32[3], 0xFC) ); + } + else + { + _mm256_store_si256( (v256u8 *)(dst + (i * 3) + (sizeof(v256u32) * 0)), _mm256_blend_epi32(src_v256u32[0], src_v256u32[1], 0xC0) ); + _mm256_store_si256( (v256u8 *)(dst + (i * 3) + (sizeof(v256u32) * 1)), _mm256_blend_epi32(src_v256u32[1], src_v256u32[2], 0xF0) ); + _mm256_store_si256( (v256u8 *)(dst + (i * 3) + (sizeof(v256u32) * 2)), _mm256_blend_epi32(src_v256u32[2], src_v256u32[3], 0xFC) ); + } + } + + return i; +} + +template +size_t ColorspaceCopyBuffer16_AVX2(const u16 *src, u16 *dst, size_t pixCountVec256) +{ + if (!SWAP_RB) + { + memcpy(dst, src, pixCountVec256 * sizeof(u16)); + return pixCountVec256; + } + + size_t i = 0; + + for (; i < pixCountVec256; i+=(sizeof(v256u16)/sizeof(u16))) + { + v256u16 src_vec256 = (IS_UNALIGNED) ? 
_mm256_loadu_si256((v256u16 *)(src+i)) : _mm256_load_si256((v256u16 *)(src+i)); + + if (IS_UNALIGNED) + { + _mm256_storeu_si256((v256u16 *)(dst+i), ColorspaceCopy16_AVX2(src_vec256)); + } + else + { + _mm256_store_si256((v256u16 *)(dst+i), ColorspaceCopy16_AVX2(src_vec256)); + } + } + + return i; +} + +template +size_t ColorspaceCopyBuffer32_AVX2(const u32 *src, u32 *dst, size_t pixCountVec256) +{ + if (!SWAP_RB) + { + memcpy(dst, src, pixCountVec256 * sizeof(u32)); + return pixCountVec256; + } + + size_t i = 0; + + for (; i < pixCountVec256; i+=(sizeof(v256u32)/sizeof(u32))) + { + v256u32 src_vec256 = (IS_UNALIGNED) ? _mm256_loadu_si256((v256u32 *)(src+i)) : _mm256_load_si256((v256u32 *)(src+i)); + + if (IS_UNALIGNED) + { + _mm256_storeu_si256((v256u32 *)(dst+i), ColorspaceCopy32_AVX2(src_vec256)); + } + else + { + _mm256_store_si256((v256u32 *)(dst+i), ColorspaceCopy32_AVX2(src_vec256)); + } + } + + return i; +} + +template +size_t ColorspaceApplyIntensityToBuffer16_AVX2(u16 *dst, size_t pixCountVec256, float intensity) +{ + size_t i = 0; + + if (intensity > 0.999f) + { + if (SWAP_RB) + { + for (; i < pixCountVec256; i+=(sizeof(v256u16)/sizeof(u16))) + { + const v256u16 dst_v256 = (IS_UNALIGNED) ? 
_mm256_loadu_si256((v256u16 *)(dst+i)) : _mm256_load_si256((v256u16 *)(dst+i)); + /* Full intensity with SWAP_RB: only exchange the fields at bits 0-4 and 10-14; the middle field (bits 5-9, mask 0x03E0) and the top bit (0x8000) stay in place. The middle mask was previously 0x0E30, which kept the wrong bits. */ const v256u16 tempDst = _mm256_or_si256( _mm256_or_si256(_mm256_srli_epi16(_mm256_and_si256(dst_v256, _mm256_set1_epi16(0x7C00)), 10), _mm256_or_si256(_mm256_and_si256(dst_v256, _mm256_set1_epi16(0x03E0)), _mm256_slli_epi16(_mm256_and_si256(dst_v256, _mm256_set1_epi16(0x001F)), 10))), _mm256_and_si256(dst_v256, _mm256_set1_epi16(0x8000)) ); + + if (IS_UNALIGNED) + { + _mm256_storeu_si256( (v256u16 *)(dst+i), tempDst); + } + else + { + _mm256_store_si256( (v256u16 *)(dst+i), tempDst); + } + } + } + else + { + return pixCountVec256; + } + } + else if (intensity < 0.001f) + { + /* Near-zero intensity: clear everything except the top bit of each 16-bit pixel. */ for (; i < pixCountVec256; i+=(sizeof(v256u16)/sizeof(u16))) + { + if (IS_UNALIGNED) + { + _mm256_storeu_si256( (v256u16 *)(dst+i), _mm256_and_si256(_mm256_loadu_si256((v256u16 *)(dst+i)), _mm256_set1_epi16(0x8000)) ); + } + else + { + _mm256_store_si256( (v256u16 *)(dst+i), _mm256_and_si256(_mm256_load_si256((v256u16 *)(dst+i)), _mm256_set1_epi16(0x8000)) ); + } + } + } + else + { + /* intensity becomes a 16-bit fixed-point factor, hoisted out of the loop. */ const v256u16 intensity_v256 = _mm256_set1_epi16( (u16)(intensity * (float)(0xFFFF)) ); + + for (; i < pixCountVec256; i+=(sizeof(v256u16)/sizeof(u16))) + { + v256u16 dst_v256 = (IS_UNALIGNED) ? _mm256_loadu_si256((v256u16 *)(dst+i)) : _mm256_load_si256((v256u16 *)(dst+i)); + /* Optional swap of the outer fields before scaling; middle field mask is 0x03E0 (bits 5-9). */ v256u16 tempDst = (SWAP_RB) ? 
_mm256_or_si256( _mm256_or_si256(_mm256_srli_epi16(_mm256_and_si256(dst_v256, _mm256_set1_epi16(0x7C00)), 10), _mm256_or_si256(_mm256_and_si256(dst_v256, _mm256_set1_epi16(0x03E0)), _mm256_slli_epi16(_mm256_and_si256(dst_v256, _mm256_set1_epi16(0x001F)), 10))), _mm256_and_si256(dst_v256, _mm256_set1_epi16(0x8000)) ) : dst_v256; + + v256u16 r = _mm256_and_si256( tempDst, _mm256_set1_epi16(0x001F) ); + v256u16 g = _mm256_and_si256( _mm256_srli_epi16(tempDst, 5), _mm256_set1_epi16(0x001F) ); + v256u16 b = _mm256_and_si256( _mm256_srli_epi16(tempDst, 10), _mm256_set1_epi16(0x001F) ); + v256u16 a = _mm256_and_si256( tempDst, _mm256_set1_epi16(0x8000) ); + + /* mulhi keeps the high 16 bits of each product; the shifts are 16-bit-lane shifts, matching ColorspaceApplyIntensity16_AVX2. */ r = _mm256_mulhi_epu16(r, intensity_v256); + g = _mm256_slli_epi16( _mm256_mulhi_epu16(g, intensity_v256), 5 ); + b = _mm256_slli_epi16( _mm256_mulhi_epu16(b, intensity_v256), 10 ); + + tempDst = _mm256_or_si256( _mm256_or_si256( _mm256_or_si256(r, g), b), a); + + if (IS_UNALIGNED) + { + _mm256_storeu_si256((v256u16 *)(dst+i), tempDst); + } + else + { + _mm256_store_si256((v256u16 *)(dst+i), tempDst); + } + } + } + + return i; +} + +template +size_t ColorspaceApplyIntensityToBuffer32_AVX2(u32 *dst, size_t pixCountVec256, float intensity) +{ + size_t i = 0; + + if (intensity > 0.999f) + { + if (SWAP_RB) + { + for (; i < pixCountVec256; i+=(sizeof(v256u32)/sizeof(u32))) + { + const v256u32 dst_v256 = (IS_UNALIGNED) ? 
_mm256_loadu_si256((v256u32 *)(dst+i)) : _mm256_load_si256((v256u32 *)(dst+i)); + const v256u32 tempDst = _mm256_shuffle_epi8(dst_v256, _mm256_set_epi8(31,28,29,30, 27,24,25,26, 23,20,21,22, 19,16,17,18, 15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2)); + + if (IS_UNALIGNED) + { + _mm256_storeu_si256( (v256u32 *)(dst+i), tempDst); + } + else + { + _mm256_store_si256( (v256u32 *)(dst+i), tempDst); + } + } + } + else + { + return pixCountVec256; + } + } + else if (intensity < 0.001f) + { + for (; i < pixCountVec256; i+=(sizeof(v256u32)/sizeof(u32))) + { + if (IS_UNALIGNED) + { + _mm256_storeu_si256( (v256u32 *)(dst+i), _mm256_and_si256(_mm256_loadu_si256((v256u32 *)(dst+i)), _mm256_set1_epi32(0xFF000000)) ); + } + else + { + _mm256_store_si256( (v256u32 *)(dst+i), _mm256_and_si256(_mm256_load_si256((v256u32 *)(dst+i)), _mm256_set1_epi32(0xFF000000)) ); + } + } + } + else + { + const v256u16 intensity_v256 = _mm256_set1_epi16( (u16)(intensity * (float)(0xFFFF)) ); + + for (; i < pixCountVec256; i+=(sizeof(v256u32)/sizeof(u32))) + { + v256u32 dst_v256 = (IS_UNALIGNED) ? _mm256_loadu_si256((v256u32 *)(dst+i)) : _mm256_load_si256((v256u32 *)(dst+i)); + v256u32 tempDst = (SWAP_RB) ? 
_mm256_shuffle_epi8(dst_v256, _mm256_set_epi8(31,28,29,30, 27,24,25,26, 23,20,21,22, 19,16,17,18, 15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2)) : dst_v256; + + v256u16 rb = _mm256_and_si256( tempDst, _mm256_set1_epi32(0x00FF00FF) ); + v256u16 g = _mm256_and_si256( _mm256_srli_epi32(tempDst, 8), _mm256_set1_epi32(0x000000FF) ); + v256u32 a = _mm256_and_si256( tempDst, _mm256_set1_epi32(0xFF000000) ); + + rb = _mm256_mulhi_epu16(rb, intensity_v256); + g = _mm256_slli_epi32( _mm256_mulhi_epu16( g, intensity_v256), 8 ); + + tempDst = _mm256_or_si256( _mm256_or_si256(rb, g), a); + + if (IS_UNALIGNED) + { + _mm256_storeu_si256((v256u32 *)(dst+i), tempDst); + } + else + { + _mm256_store_si256((v256u32 *)(dst+i), tempDst); + } + } + } + + return i; +} + +template +size_t ColorspaceHandler_AVX2::ConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To8888Opaque_AVX2(src, dst, pixCount); +} + +template +size_t ColorspaceHandler_AVX2::ConvertBuffer555To8888Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To8888Opaque_AVX2(src, dst, pixCount); +} + +template +size_t ColorspaceHandler_AVX2::ConvertBuffer555To8888Opaque_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To8888Opaque_AVX2(src, dst, pixCount); +} + +template +size_t ColorspaceHandler_AVX2::ConvertBuffer555To8888Opaque_SwapRB_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To8888Opaque_AVX2(src, dst, pixCount); +} + +template +size_t ColorspaceHandler_AVX2::ConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To6665Opaque_AVX2(src, dst, pixCount); +} + +template +size_t ColorspaceHandler_AVX2::ConvertBuffer555To6665Opaque_SwapRB(const u16 
*__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To6665Opaque_AVX2(src, dst, pixCount); +} + +template +size_t ColorspaceHandler_AVX2::ConvertBuffer555To6665Opaque_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To6665Opaque_AVX2(src, dst, pixCount); +} + +template +size_t ColorspaceHandler_AVX2::ConvertBuffer555To6665Opaque_SwapRB_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To6665Opaque_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To6665_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer8888To6665_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To6665_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer8888To6665_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To6665_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer8888To6665_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To6665_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To8888_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer6665To8888_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To8888_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer6665To8888_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To8888_AVX2(src, dst, pixCount); +} + +size_t 
ColorspaceHandler_AVX2::ConvertBuffer6665To8888_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To8888_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To5551_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer8888To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To5551_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer8888To5551_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To5551_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer8888To5551_SwapRB_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To5551_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To5551_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer6665To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To5551_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer6665To5551_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To5551_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer6665To5551_SwapRB_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To5551_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t 
pixCount) const +{ + return ColorspaceConvertBuffer888XTo8888Opaque_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer888XTo8888Opaque_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo8888Opaque_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer888XTo8888Opaque_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo8888Opaque_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer888XTo8888Opaque_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo8888Opaque_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer555XTo888(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555XTo888_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer555XTo888_SwapRB(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555XTo888_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer555XTo888_IsUnaligned(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555XTo888_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer555XTo888_SwapRB_IsUnaligned(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555XTo888_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer888XTo888(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo888_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer888XTo888_SwapRB(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo888_AVX2(src, dst, pixCount); +} + +size_t 
ColorspaceHandler_AVX2::ConvertBuffer888XTo888_IsUnaligned(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo888_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ConvertBuffer888XTo888_SwapRB_IsUnaligned(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo888_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const +{ + return ColorspaceCopyBuffer16_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::CopyBuffer16_SwapRB_IsUnaligned(const u16 *src, u16 *dst, size_t pixCount) const +{ + return ColorspaceCopyBuffer16_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::CopyBuffer32_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceCopyBuffer32_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::CopyBuffer32_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceCopyBuffer32_AVX2(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX2::ApplyIntensityToBuffer16(u16 *dst, size_t pixCount, float intensity) const +{ + return ColorspaceApplyIntensityToBuffer16_AVX2(dst, pixCount, intensity); +} + +size_t ColorspaceHandler_AVX2::ApplyIntensityToBuffer16_SwapRB(u16 *dst, size_t pixCount, float intensity) const +{ + return ColorspaceApplyIntensityToBuffer16_AVX2(dst, pixCount, intensity); +} + +size_t ColorspaceHandler_AVX2::ApplyIntensityToBuffer16_IsUnaligned(u16 *dst, size_t pixCount, float intensity) const +{ + return ColorspaceApplyIntensityToBuffer16_AVX2(dst, pixCount, intensity); +} + +size_t ColorspaceHandler_AVX2::ApplyIntensityToBuffer16_SwapRB_IsUnaligned(u16 *dst, size_t pixCount, float intensity) const +{ + return ColorspaceApplyIntensityToBuffer16_AVX2(dst, pixCount, intensity); +} + +size_t ColorspaceHandler_AVX2::ApplyIntensityToBuffer32(u32 *dst, size_t 
pixCount, float intensity) const +{ + return ColorspaceApplyIntensityToBuffer32_AVX2(dst, pixCount, intensity); +} + +size_t ColorspaceHandler_AVX2::ApplyIntensityToBuffer32_SwapRB(u32 *dst, size_t pixCount, float intensity) const +{ + return ColorspaceApplyIntensityToBuffer32_AVX2(dst, pixCount, intensity); +} + +size_t ColorspaceHandler_AVX2::ApplyIntensityToBuffer32_IsUnaligned(u32 *dst, size_t pixCount, float intensity) const +{ + return ColorspaceApplyIntensityToBuffer32_AVX2(dst, pixCount, intensity); +} + +size_t ColorspaceHandler_AVX2::ApplyIntensityToBuffer32_SwapRB_IsUnaligned(u32 *dst, size_t pixCount, float intensity) const +{ + return ColorspaceApplyIntensityToBuffer32_AVX2(dst, pixCount, intensity); +} + +template void ColorspaceConvert555To8888_AVX2(const v256u16 &srcColor, const v256u16 &srcAlphaBits, v256u32 &dstLo, v256u32 &dstHi); +template void ColorspaceConvert555To8888_AVX2(const v256u16 &srcColor, const v256u16 &srcAlphaBits, v256u32 &dstLo, v256u32 &dstHi); + +template void ColorspaceConvert555XTo888X_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi); +template void ColorspaceConvert555XTo888X_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi); + +template void ColorspaceConvert555To6665_AVX2(const v256u16 &srcColor, const v256u16 &srcAlphaBits, v256u32 &dstLo, v256u32 &dstHi); +template void ColorspaceConvert555To6665_AVX2(const v256u16 &srcColor, const v256u16 &srcAlphaBits, v256u32 &dstLo, v256u32 &dstHi); + +template void ColorspaceConvert555XTo666X_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi); +template void ColorspaceConvert555XTo666X_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi); + +template void ColorspaceConvert555To8888Opaque_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi); +template void ColorspaceConvert555To8888Opaque_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi); + +template void ColorspaceConvert555To6665Opaque_AVX2(const v256u16 &srcColor, 
v256u32 &dstLo, v256u32 &dstHi); +template void ColorspaceConvert555To6665Opaque_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi); + +template v256u32 ColorspaceConvert8888To6665_AVX2(const v256u32 &src); +template v256u32 ColorspaceConvert8888To6665_AVX2(const v256u32 &src); + +template v256u32 ColorspaceConvert6665To8888_AVX2(const v256u32 &src); +template v256u32 ColorspaceConvert6665To8888_AVX2(const v256u32 &src); + +template v256u16 ColorspaceConvert8888To5551_AVX2(const v256u32 &srcLo, const v256u32 &srcHi); +template v256u16 ColorspaceConvert8888To5551_AVX2(const v256u32 &srcLo, const v256u32 &srcHi); + +template v256u16 ColorspaceConvert6665To5551_AVX2(const v256u32 &srcLo, const v256u32 &srcHi); +template v256u16 ColorspaceConvert6665To5551_AVX2(const v256u32 &srcLo, const v256u32 &srcHi); + +template v256u32 ColorspaceConvert888XTo8888Opaque_AVX2(const v256u32 &src); +template v256u32 ColorspaceConvert888XTo8888Opaque_AVX2(const v256u32 &src); + +template v256u16 ColorspaceCopy16_AVX2(const v256u16 &src); +template v256u16 ColorspaceCopy16_AVX2(const v256u16 &src); + +template v256u32 ColorspaceCopy32_AVX2(const v256u32 &src); +template v256u32 ColorspaceCopy32_AVX2(const v256u32 &src); + +template v256u16 ColorspaceApplyIntensity16_AVX2(const v256u16 &src, float intensity); +template v256u16 ColorspaceApplyIntensity16_AVX2(const v256u16 &src, float intensity); + +template v256u32 ColorspaceApplyIntensity32_AVX2(const v256u32 &src, float intensity); +template v256u32 ColorspaceApplyIntensity32_AVX2(const v256u32 &src, float intensity); + +#endif // ENABLE_AVX2 diff --git a/FSCHMasterEditor/nds/colorspacehandler/colorspacehandler_AVX2.h b/FSCHMasterEditor/nds/colorspacehandler/colorspacehandler_AVX2.h new file mode 100644 index 0000000..af8f832 --- /dev/null +++ b/FSCHMasterEditor/nds/colorspacehandler/colorspacehandler_AVX2.h @@ -0,0 +1,114 @@ +/* + Copyright (C) 2016-2021 DeSmuME team + + This file is free software: you can redistribute it 
and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This file is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with the this software. If not, see . + */ + +#ifndef COLORSPACEHANDLER_AVX2_H +#define COLORSPACEHANDLER_AVX2_H + +#include "colorspacehandler.h" + +#ifndef ENABLE_AVX2 + #warning This header requires AVX2 support. +#else + +template void ColorspaceConvert555To8888_AVX2(const v256u16 &srcColor, const v256u16 &srcAlphaBits, v256u32 &dstLo, v256u32 &dstHi); +template void ColorspaceConvert555XTo888X_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi); +template void ColorspaceConvert555To6665_AVX2(const v256u16 &srcColor, const v256u16 &srcAlphaBits, v256u32 &dstLo, v256u32 &dstHi); +template void ColorspaceConvert555XTo666X_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi); +template void ColorspaceConvert555To8888Opaque_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi); +template void ColorspaceConvert555To6665Opaque_AVX2(const v256u16 &srcColor, v256u32 &dstLo, v256u32 &dstHi); +template v256u32 ColorspaceConvert8888To6665_AVX2(const v256u32 &src); +template v256u32 ColorspaceConvert6665To8888_AVX2(const v256u32 &src); +template v256u16 ColorspaceConvert8888To5551_AVX2(const v256u32 &srcLo, const v256u32 &srcHi); +template v256u16 ColorspaceConvert6665To5551_AVX2(const v256u32 &srcLo, const v256u32 &srcHi); +template v256u32 ColorspaceConvert888XTo8888Opaque_AVX2(const v256u32 &src); + +template v256u16 ColorspaceCopy16_AVX2(const v256u16 &src); +template v256u32 ColorspaceCopy32_AVX2(const v256u32 &src); + +template v256u16 
ColorspaceApplyIntensity16_AVX2(const v256u16 &src, float intensity); +template v256u32 ColorspaceApplyIntensity32_AVX2(const v256u32 &src, float intensity); + +class ColorspaceHandler_AVX2 : public ColorspaceHandler +{ +public: + ColorspaceHandler_AVX2() {}; + + template size_t ConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + template size_t ConvertBuffer555To8888Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + template size_t ConvertBuffer555To8888Opaque_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + template size_t ConvertBuffer555To8888Opaque_SwapRB_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + + template size_t ConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + template size_t ConvertBuffer555To6665Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + template size_t ConvertBuffer555To6665Opaque_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + template size_t ConvertBuffer555To6665Opaque_SwapRB_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + + size_t ConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer8888To6665_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer8888To6665_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer8888To6665_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + + size_t ConvertBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer6665To8888_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer6665To8888_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer6665To8888_SwapRB_IsUnaligned(const u32 
*src, u32 *dst, size_t pixCount) const; + + size_t ConvertBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer8888To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer8888To5551_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer8888To5551_SwapRB_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + + size_t ConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer6665To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer6665To5551_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer6665To5551_SwapRB_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + + size_t ConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer888XTo8888Opaque_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer888XTo8888Opaque_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer888XTo8888Opaque_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + + size_t ConvertBuffer555XTo888(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555XTo888_SwapRB(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555XTo888_IsUnaligned(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555XTo888_SwapRB_IsUnaligned(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + + size_t ConvertBuffer888XTo888(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer888XTo888_SwapRB(const u32 *__restrict src, u8 *__restrict dst, 
size_t pixCount) const; + size_t ConvertBuffer888XTo888_IsUnaligned(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer888XTo888_SwapRB_IsUnaligned(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + + size_t CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const; + size_t CopyBuffer16_SwapRB_IsUnaligned(const u16 *src, u16 *dst, size_t pixCount) const; + + size_t CopyBuffer32_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; + size_t CopyBuffer32_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + + size_t ApplyIntensityToBuffer16(u16 *dst, size_t pixCount, float intensity) const; + size_t ApplyIntensityToBuffer16_SwapRB(u16 *dst, size_t pixCount, float intensity) const; + size_t ApplyIntensityToBuffer16_IsUnaligned(u16 *dst, size_t pixCount, float intensity) const; + size_t ApplyIntensityToBuffer16_SwapRB_IsUnaligned(u16 *dst, size_t pixCount, float intensity) const; + + size_t ApplyIntensityToBuffer32(u32 *dst, size_t pixCount, float intensity) const; + size_t ApplyIntensityToBuffer32_SwapRB(u32 *dst, size_t pixCount, float intensity) const; + size_t ApplyIntensityToBuffer32_IsUnaligned(u32 *dst, size_t pixCount, float intensity) const; + size_t ApplyIntensityToBuffer32_SwapRB_IsUnaligned(u32 *dst, size_t pixCount, float intensity) const; +}; + +#endif // ENABLE_AVX2 + +#endif /* COLORSPACEHANDLER_AVX2_H */ diff --git a/FSCHMasterEditor/nds/colorspacehandler/colorspacehandler_AVX512.cpp b/FSCHMasterEditor/nds/colorspacehandler/colorspacehandler_AVX512.cpp new file mode 100644 index 0000000..713b145 --- /dev/null +++ b/FSCHMasterEditor/nds/colorspacehandler/colorspacehandler_AVX512.cpp @@ -0,0 +1,1153 @@ +/* + Copyright (C) 2016-2021 DeSmuME team + + This file is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your 
option) any later version. + + This file is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with the this software. If not, see . + */ + +#include "colorspacehandler_AVX512.h" + +#ifndef ENABLE_AVX512_1 + #error This code requires AVX-512 Tier-1 support. +#else + +#include +#include + +template +FORCEINLINE void ColorspaceConvert555To8888_AVX512(const v512u16 &srcColor, const v512u16 &srcAlphaBits, v512u32 &dstLo, v512u32 &dstHi) +{ + // Conversion algorithm: + // RGB 5-bit to 8-bit formula: dstRGB8 = (srcRGB5 << 3) | ((srcRGB5 >> 2) & 0x07) + + const v512u16 r = (SWAP_RB) ? _mm512_and_si512( _mm512_srli_epi16(srcColor, 7), _mm512_set1_epi16(0x00F8) ) : _mm512_and_si512( _mm512_slli_epi16(srcColor, 3), _mm512_set1_epi16(0x00F8) ); + const v512u16 b = (SWAP_RB) ? 
_mm512_and_si512( _mm512_slli_epi16(srcColor, 3), _mm512_set1_epi16(0x00F8) ) : _mm512_and_si512( _mm512_srli_epi16(srcColor, 7), _mm512_set1_epi16(0x00F8) ); + + v512u16 rg = _mm512_or_si512( r, _mm512_and_si512(_mm512_slli_epi16(srcColor, 6), _mm512_set1_epi16(0xF800)) ); + rg = _mm512_or_si512( rg, _mm512_and_si512(_mm512_srli_epi16(rg, 5), _mm512_set1_epi16(0x0707)) ); + + v512u16 ba = _mm512_or_si512(b, _mm512_srli_epi16(b, 5)); + ba = _mm512_or_si512(ba, srcAlphaBits); + + dstLo = _mm512_permutex2var_epi16(rg, _mm512_set_epi16(0x2F,0x0F,0x2E,0x0E,0x2D,0x0D,0x2C,0x0C, 0x2B,0x0B,0x2A,0x0A,0x29,0x09,0x28,0x08, 0x27,0x07,0x26,0x06,0x25,0x05,0x24,0x04, 0x23,0x03,0x22,0x02,0x21,0x01,0x20,0x00), ba); + dstHi = _mm512_permutex2var_epi16(rg, _mm512_set_epi16(0x3F,0x1F,0x3E,0x1E,0x3D,0x1D,0x3C,0x1C, 0x3B,0x1B,0x3A,0x1A,0x39,0x19,0x38,0x18, 0x37,0x17,0x36,0x16,0x35,0x15,0x34,0x14, 0x33,0x13,0x32,0x12,0x31,0x11,0x30,0x10), ba); +} + +template +FORCEINLINE void ColorspaceConvert555XTo888X_AVX512(const v512u16 &srcColor, v512u32 &dstLo, v512u32 &dstHi) +{ + // Conversion algorithm: + // RGB 5-bit to 8-bit formula: dstRGB8 = (srcRGB5 << 3) | ((srcRGB5 >> 2) & 0x07) + + const v512u16 r = (SWAP_RB) ? _mm512_and_si512( _mm512_srli_epi16(srcColor, 7), _mm512_set1_epi16(0x00F8) ) : _mm512_and_si512( _mm512_slli_epi16(srcColor, 3), _mm512_set1_epi16(0x00F8) ); + const v512u16 b = (SWAP_RB) ? 
_mm512_and_si512( _mm512_slli_epi16(srcColor, 3), _mm512_set1_epi16(0x00F8) ) : _mm512_and_si512( _mm512_srli_epi16(srcColor, 7), _mm512_set1_epi16(0x00F8) ); + + v512u16 rg = _mm512_or_si512( r, _mm512_and_si512(_mm512_slli_epi16(srcColor, 6), _mm512_set1_epi16(0xF800)) ); + rg = _mm512_or_si512( rg, _mm512_and_si512(_mm512_srli_epi16(rg, 5), _mm512_set1_epi16(0x0707)) ); + + v512u16 ba = _mm512_or_si512(b, _mm512_srli_epi16(b, 5)); + + dstLo = _mm512_permutex2var_epi16(rg, _mm512_set_epi16(0x2F,0x0F,0x2E,0x0E,0x2D,0x0D,0x2C,0x0C, 0x2B,0x0B,0x2A,0x0A,0x29,0x09,0x28,0x08, 0x27,0x07,0x26,0x06,0x25,0x05,0x24,0x04, 0x23,0x03,0x22,0x02,0x21,0x01,0x20,0x00), ba); + dstHi = _mm512_permutex2var_epi16(rg, _mm512_set_epi16(0x3F,0x1F,0x3E,0x1E,0x3D,0x1D,0x3C,0x1C, 0x3B,0x1B,0x3A,0x1A,0x39,0x19,0x38,0x18, 0x37,0x17,0x36,0x16,0x35,0x15,0x34,0x14, 0x33,0x13,0x32,0x12,0x31,0x11,0x30,0x10), ba); +} + +template +FORCEINLINE void ColorspaceConvert555To6665_AVX512(const v512u16 &srcColor, const v512u16 &srcAlphaBits, v512u32 &dstLo, v512u32 &dstHi) +{ + // Conversion algorithm: + // RGB 5-bit to 6-bit formula: dstRGB6 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01) + + const v512u16 r = (SWAP_RB) ? _mm512_and_si512( _mm512_srli_epi16(srcColor, 9), _mm512_set1_epi16(0x003E) ) : _mm512_and_si512( _mm512_slli_epi16(srcColor, 1), _mm512_set1_epi16(0x003E) ); + const v512u16 b = (SWAP_RB) ? 
_mm512_and_si512( _mm512_slli_epi16(srcColor, 1), _mm512_set1_epi16(0x003E) ) : _mm512_and_si512( _mm512_srli_epi16(srcColor, 9), _mm512_set1_epi16(0x003E) ); + + v512u16 rg = _mm512_or_si512( r, _mm512_and_si512(_mm512_slli_epi16(srcColor, 4), _mm512_set1_epi16(0x3E00)) ); + rg = _mm512_or_si512( rg, _mm512_and_si512(_mm512_srli_epi16(rg, 5), _mm512_set1_epi16(0x0101)) ); + + v512u16 ba = _mm512_or_si512(b, _mm512_srli_epi16(b, 5)); + ba = _mm512_or_si512(ba, srcAlphaBits); + + dstLo = _mm512_permutex2var_epi16(rg, _mm512_set_epi16(0x2F,0x0F,0x2E,0x0E,0x2D,0x0D,0x2C,0x0C, 0x2B,0x0B,0x2A,0x0A,0x29,0x09,0x28,0x08, 0x27,0x07,0x26,0x06,0x25,0x05,0x24,0x04, 0x23,0x03,0x22,0x02,0x21,0x01,0x20,0x00), ba); + dstHi = _mm512_permutex2var_epi16(rg, _mm512_set_epi16(0x3F,0x1F,0x3E,0x1E,0x3D,0x1D,0x3C,0x1C, 0x3B,0x1B,0x3A,0x1A,0x39,0x19,0x38,0x18, 0x37,0x17,0x36,0x16,0x35,0x15,0x34,0x14, 0x33,0x13,0x32,0x12,0x31,0x11,0x30,0x10), ba); +} + +template +FORCEINLINE void ColorspaceConvert555XTo666X_AVX512(const v512u16 &srcColor, v512u32 &dstLo, v512u32 &dstHi) +{ + // Conversion algorithm: + // RGB 5-bit to 6-bit formula: dstRGB6 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01) + + const v512u16 r = (SWAP_RB) ? _mm512_and_si512( _mm512_srli_epi16(srcColor, 9), _mm512_set1_epi16(0x003E) ) : _mm512_and_si512( _mm512_slli_epi16(srcColor, 1), _mm512_set1_epi16(0x003E) ); + const v512u16 b = (SWAP_RB) ? 
_mm512_and_si512( _mm512_slli_epi16(srcColor, 1), _mm512_set1_epi16(0x003E) ) : _mm512_and_si512( _mm512_srli_epi16(srcColor, 9), _mm512_set1_epi16(0x003E) ); + + v512u16 rg = _mm512_or_si512( r, _mm512_and_si512(_mm512_slli_epi16(srcColor, 4), _mm512_set1_epi16(0x3E00)) ); + rg = _mm512_or_si512( rg, _mm512_and_si512(_mm512_srli_epi16(rg, 5), _mm512_set1_epi16(0x0101)) ); + + v512u16 ba = _mm512_or_si512(b, _mm512_srli_epi16(b, 5)); + + dstLo = _mm512_permutex2var_epi16(rg, _mm512_set_epi16(0x2F,0x0F,0x2E,0x0E,0x2D,0x0D,0x2C,0x0C, 0x2B,0x0B,0x2A,0x0A,0x29,0x09,0x28,0x08, 0x27,0x07,0x26,0x06,0x25,0x05,0x24,0x04, 0x23,0x03,0x22,0x02,0x21,0x01,0x20,0x00), ba); + dstHi = _mm512_permutex2var_epi16(rg, _mm512_set_epi16(0x3F,0x1F,0x3E,0x1E,0x3D,0x1D,0x3C,0x1C, 0x3B,0x1B,0x3A,0x1A,0x39,0x19,0x38,0x18, 0x37,0x17,0x36,0x16,0x35,0x15,0x34,0x14, 0x33,0x13,0x32,0x12,0x31,0x11,0x30,0x10), ba); +} + +template +FORCEINLINE void ColorspaceConvert555To8888Opaque_AVX512(const v512u16 &srcColor, v512u32 &dstLo, v512u32 &dstHi) +{ + const v512u16 srcAlphaBits16 = _mm512_set1_epi16(0xFF00); + ColorspaceConvert555To8888_AVX512(srcColor, srcAlphaBits16, dstLo, dstHi); +} + +template +FORCEINLINE void ColorspaceConvert555To6665Opaque_AVX512(const v512u16 &srcColor, v512u32 &dstLo, v512u32 &dstHi) +{ + const v512u16 srcAlphaBits16 = _mm512_set1_epi16(0x1F00); + ColorspaceConvert555To6665_AVX512(srcColor, srcAlphaBits16, dstLo, dstHi); +} + +template +FORCEINLINE v512u32 ColorspaceConvert8888To6665_AVX512(const v512u32 &src) +{ + // Conversion algorithm: + // RGB 8-bit to 6-bit formula: dstRGB6 = (srcRGB8 >> 2) + // Alpha 8-bit to 6-bit formula: dstA5 = (srcA8 >> 3) + v512u32 rgb = _mm512_and_si512( _mm512_srli_epi32(src, 2), _mm512_set1_epi32(0x003F3F3F) ); + const v512u32 a = _mm512_and_si512( _mm512_srli_epi32(src, 3), _mm512_set1_epi32(0x1F000000) ); + + if (SWAP_RB) + { + rgb = _mm512_shuffle_epi8( rgb, _mm512_set_epi8(63,60,61,62, 59,56,57,58, 55,52,53,54, 51,48,49,50, 47,44,45,46, 
43,40,41,42, 39,36,37,38, 35,32,33,34, 31,28,29,30, 27,24,25,26, 23,20,21,22, 19,16,17,18, 15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2) ); + } + + return _mm512_or_si512(rgb, a); +}
+
+// Expands every 32-bit lane from 6 bits per color channel / 5 bits of alpha
+// to 8 bits per channel. When SWAP_RB is set, bytes 0 and 2 of each lane are
+// exchanged after the expansion; the alpha byte never moves.
+template <bool SWAP_RB>
+FORCEINLINE v512u32 ColorspaceConvert6665To8888_AVX512(const v512u32 &src)
+{
+	// Conversion algorithm:
+	//    RGB   6-bit to 8-bit formula: dstRGB8 = (srcRGB6 << 2) | ((srcRGB6 >> 4) & 0x03)
+	//    Alpha 5-bit to 8-bit formula: dstA8   = (srcA5   << 3) | ((srcA5   >> 2) & 0x07)
+	v512u32 rgb = _mm512_or_si512( _mm512_and_si512(_mm512_slli_epi32(src, 2), _mm512_set1_epi32(0x00FCFCFC)), _mm512_and_si512(_mm512_srli_epi32(src, 4), _mm512_set1_epi32(0x00030303)) );
+	const v512u32 a = _mm512_or_si512( _mm512_and_si512(_mm512_slli_epi32(src, 3), _mm512_set1_epi32(0xF8000000)), _mm512_and_si512(_mm512_srli_epi32(src, 2), _mm512_set1_epi32(0x07000000)) );
+	
+	if (SWAP_RB)
+	{
+		rgb = _mm512_shuffle_epi8( rgb, _mm512_set_epi8(63,60,61,62, 59,56,57,58, 55,52,53,54, 51,48,49,50, 47,44,45,46, 43,40,41,42, 39,36,37,38, 35,32,33,34, 31,28,29,30, 27,24,25,26, 23,20,21,22, 19,16,17,18, 15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2) );
+	}
+	
+	return _mm512_or_si512(rgb, a);
+}
+
+// Narrows two vectors of 32-bit pixels into a single vector of 16-bit 5551
+// pixels. The pixels taken from srcLo fill the lower half of the result and
+// the pixels taken from srcHi fill the upper half.
+//
+// COLORFORMAT selects how the source lanes are read:
+//    NDSColorFormat_BGR666_Rev - 6 bits per color channel, alpha in bits 24-28
+//    NDSColorFormat_BGR888_Rev - 8 bits per color channel, alpha in bits 24-31
+//    NDSColorFormat_BGR555_Rev - nothing to convert; srcLo is returned as-is
+//
+// Each color channel keeps its five most significant bits. Bit 15 of an output
+// pixel is set whenever the source alpha is non-zero. With SWAP_RB the channel
+// read from byte 0 and the channel read from byte 2 trade places in the output.
+template <NDSColorFormat COLORFORMAT, bool SWAP_RB>
+FORCEINLINE v512u16 _ConvertColorBaseTo5551_AVX512(const v512u32 &srcLo, const v512u32 &srcHi)
+{
+	if (COLORFORMAT == NDSColorFormat_BGR555_Rev)
+	{
+		return srcLo;
+	}
+	
+	v512u32 rgbLo;
+	v512u32 rgbHi;
+	v512u16 alpha;
+	
+	if (COLORFORMAT == NDSColorFormat_BGR666_Rev)
+	{
+		if (SWAP_RB)
+		{
+			// Byte 0 of the source holds a 6-bit channel in bits 0-5. Its top five
+			// bits (1-5) have to end up in output bits 10-14, which is a shift UP
+			// by 9. (A right shift here would read bits 19-23 instead.)
+			
+			// Convert color from low bits
+			rgbLo =                        _mm512_and_si512(_mm512_srli_epi32(srcLo, 17), _mm512_set1_epi32(0x0000001F));
+			rgbLo = _mm512_or_si512(rgbLo, _mm512_and_si512(_mm512_srli_epi32(srcLo,  4), _mm512_set1_epi32(0x000003E0)) );
+			rgbLo = _mm512_or_si512(rgbLo, _mm512_and_si512(_mm512_slli_epi32(srcLo,  9), _mm512_set1_epi32(0x00007C00)) );
+			
+			// Convert color from high bits
+			rgbHi =                        _mm512_and_si512(_mm512_srli_epi32(srcHi, 17), _mm512_set1_epi32(0x0000001F));
+			rgbHi = _mm512_or_si512(rgbHi, _mm512_and_si512(_mm512_srli_epi32(srcHi,  4), _mm512_set1_epi32(0x000003E0)) );
+			rgbHi = _mm512_or_si512(rgbHi, _mm512_and_si512(_mm512_slli_epi32(srcHi,  9), _mm512_set1_epi32(0x00007C00)) );
+		}
+		else
+		{
+			// Convert color from low bits
+			rgbLo =                        _mm512_and_si512(_mm512_srli_epi32(srcLo,  1), _mm512_set1_epi32(0x0000001F));
+			rgbLo = _mm512_or_si512(rgbLo, _mm512_and_si512(_mm512_srli_epi32(srcLo,  4), _mm512_set1_epi32(0x000003E0)) );
+			rgbLo = _mm512_or_si512(rgbLo, _mm512_and_si512(_mm512_srli_epi32(srcLo,  7), _mm512_set1_epi32(0x00007C00)) );
+			
+			// Convert color from high bits
+			rgbHi =                        _mm512_and_si512(_mm512_srli_epi32(srcHi,  1), _mm512_set1_epi32(0x0000001F));
+			rgbHi = _mm512_or_si512(rgbHi, _mm512_and_si512(_mm512_srli_epi32(srcHi,  4), _mm512_set1_epi32(0x000003E0)) );
+			rgbHi = _mm512_or_si512(rgbHi, _mm512_and_si512(_mm512_srli_epi32(srcHi,  7), _mm512_set1_epi32(0x00007C00)) );
+		}
+		
+		// Convert alpha
+		// packus works per 128-bit lane, so the 64-bit permute afterwards puts
+		// the srcLo results before the srcHi results again.
+		alpha = _mm512_packus_epi32( _mm512_and_si512(_mm512_srli_epi32(srcLo, 24), _mm512_set1_epi32(0x0000001F)), _mm512_and_si512(_mm512_srli_epi32(srcHi, 24), _mm512_set1_epi32(0x0000001F)) );
+		alpha = _mm512_permutexvar_epi64(_mm512_set_epi64(7,5,3,1,6,4,2,0), alpha);
+		alpha = _mm512_maskz_set1_epi16(_mm512_cmpgt_epi16_mask(alpha, _mm512_setzero_si512()), 0x8000);
+	}
+	else if (COLORFORMAT == NDSColorFormat_BGR888_Rev)
+	{
+		if (SWAP_RB)
+		{
+			// Convert color from low bits
+			rgbLo =                        _mm512_and_si512(_mm512_srli_epi32(srcLo, 19), _mm512_set1_epi32(0x0000001F));
+			rgbLo = _mm512_or_si512(rgbLo, _mm512_and_si512(_mm512_srli_epi32(srcLo,  6), _mm512_set1_epi32(0x000003E0)) );
+			rgbLo = _mm512_or_si512(rgbLo, _mm512_and_si512(_mm512_slli_epi32(srcLo,  7), _mm512_set1_epi32(0x00007C00)) );
+			
+			// Convert color from high bits
+			rgbHi =                        _mm512_and_si512(_mm512_srli_epi32(srcHi, 19), _mm512_set1_epi32(0x0000001F));
+			rgbHi = _mm512_or_si512(rgbHi, _mm512_and_si512(_mm512_srli_epi32(srcHi,  6), _mm512_set1_epi32(0x000003E0)) );
+			rgbHi = _mm512_or_si512(rgbHi, _mm512_and_si512(_mm512_slli_epi32(srcHi,  7), _mm512_set1_epi32(0x00007C00)) );
+		}
+		else
+		{
+			// Convert color from low bits
+			rgbLo =                        _mm512_and_si512(_mm512_srli_epi32(srcLo,  3), _mm512_set1_epi32(0x0000001F));
+			rgbLo = _mm512_or_si512(rgbLo, _mm512_and_si512(_mm512_srli_epi32(srcLo,  6), _mm512_set1_epi32(0x000003E0)) );
+			rgbLo = _mm512_or_si512(rgbLo, _mm512_and_si512(_mm512_srli_epi32(srcLo,  9), _mm512_set1_epi32(0x00007C00)) );
+			
+			// Convert color from high bits
+			rgbHi =                        _mm512_and_si512(_mm512_srli_epi32(srcHi,  3), _mm512_set1_epi32(0x0000001F));
+			rgbHi = _mm512_or_si512(rgbHi, _mm512_and_si512(_mm512_srli_epi32(srcHi,  6), _mm512_set1_epi32(0x000003E0)) );
+			rgbHi = _mm512_or_si512(rgbHi, _mm512_and_si512(_mm512_srli_epi32(srcHi,  9), _mm512_set1_epi32(0x00007C00)) );
+		}
+		
+		// Convert alpha
+		alpha = _mm512_packus_epi32( _mm512_srli_epi32(srcLo, 24), _mm512_srli_epi32(srcHi, 24) );
+		alpha = _mm512_permutexvar_epi64(_mm512_set_epi64(7,5,3,1,6,4,2,0), alpha);
+		alpha = _mm512_maskz_set1_epi16(_mm512_cmpgt_epi16_mask(alpha, _mm512_setzero_si512()), 0x8000);
+	}
+	
+	return _mm512_or_si512( _mm512_permutexvar_epi64(_mm512_set_epi64(7,5,3,1,6,4,2,0), _mm512_packus_epi32(rgbLo, rgbHi)), alpha );
+}
+
+// 8888 -> 5551: forwards to the shared converter using the 8-bit-per-channel source layout.
+template <bool SWAP_RB>
+FORCEINLINE v512u16 ColorspaceConvert8888To5551_AVX512(const v512u32 &srcLo, const v512u32 &srcHi)
+{
+	return _ConvertColorBaseTo5551_AVX512<NDSColorFormat_BGR888_Rev, SWAP_RB>(srcLo, srcHi);
+}
+
+// 6665 -> 5551: forwards to the shared converter using the 6-bit-per-channel source layout.
+template <bool SWAP_RB>
+FORCEINLINE v512u16 ColorspaceConvert6665To5551_AVX512(const v512u32 &srcLo, const v512u32 &srcHi)
+{
+	return _ConvertColorBaseTo5551_AVX512<NDSColorFormat_BGR666_Rev, SWAP_RB>(srcLo, srcHi);
+}
+
+template +FORCEINLINE v512u32 ColorspaceConvert888XTo8888Opaque_AVX512(const v512u32 &src) +{ + if (SWAP_RB) + { + return _mm512_or_si512( _mm512_shuffle_epi8(src, _mm512_set_epi8(63,60,61,62, 59,56,57,58, 55,52,53,54, 51,48,49,50, 47,44,45,46, 43,40,41,42, 39,36,37,38, 35,32,33,34, 31,28,29,30, 27,24,25,26, 23,20,21,22, 19,16,17,18, 15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2)), _mm512_set1_epi32(0xFF000000) ); + } + + 
return _mm512_or_si512(src, _mm512_set1_epi32(0xFF000000)); +}
+
+// Copies a vector of 16-bit pixels, optionally exchanging the two outer 5-bit
+// color fields (bits 0-4 and bits 10-14). The middle field (bits 5-9) and the
+// top bit (bit 15) stay where they are. Without SWAP_RB the input is returned
+// unchanged.
+template <bool SWAP_RB>
+FORCEINLINE v512u16 ColorspaceCopy16_AVX512(const v512u16 &src)
+{
+	if (SWAP_RB)
+	{
+		const v512u16 highToLow = _mm512_srli_epi16(_mm512_and_si512(src, _mm512_set1_epi16(0x7C00)), 10);
+		// 0x03E0 selects exactly bits 5-9. Any other mask either drops part of
+		// the middle channel or lets bits of the outer channels leak through.
+		const v512u16 middle    = _mm512_and_si512(src, _mm512_set1_epi16(0x03E0));
+		const v512u16 lowToHigh = _mm512_slli_epi16(_mm512_and_si512(src, _mm512_set1_epi16(0x001F)), 10);
+		const v512u16 topBit    = _mm512_and_si512(src, _mm512_set1_epi16(0x8000));
+		
+		return _mm512_or_si512( _mm512_or_si512(highToLow, _mm512_or_si512(middle, lowToHigh)), topBit );
+	}
+	
+	return src;
+}
+
+// Copies a vector of 32-bit pixels. With SWAP_RB, bytes 0 and 2 of every pixel
+// are exchanged; bytes 1 and 3 keep their position.
+template <bool SWAP_RB>
+FORCEINLINE v512u32 ColorspaceCopy32_AVX512(const v512u32 &src)
+{
+	if (SWAP_RB)
+	{
+		return _mm512_shuffle_epi8(src, _mm512_set_epi8(63,60,61,62, 59,56,57,58, 55,52,53,54, 51,48,49,50, 47,44,45,46, 43,40,41,42, 39,36,37,38, 35,32,33,34, 31,28,29,30, 27,24,25,26, 23,20,21,22, 19,16,17,18, 15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2));
+	}
+	
+	return src;
+}
+
+// Scales the three 5-bit color fields of every 16-bit pixel by `intensity`
+// and leaves bit 15 untouched. The optional field swap is applied first.
+//
+//    intensity > 0.999f : the (possibly swapped) pixels are returned unscaled
+//    intensity < 0.001f : only bit 15 of each pixel survives
+//    otherwise          : each field becomes (field * (u16)(intensity * 65535)) >> 16
+template <bool SWAP_RB>
+FORCEINLINE v512u16 ColorspaceApplyIntensity16_AVX512(const v512u16 &src, float intensity)
+{
+	// Reuse the copy helper so both functions share one definition of the swap.
+	v512u16 tempSrc = ColorspaceCopy16_AVX512<SWAP_RB>(src);
+	
+	if (intensity > 0.999f)
+	{
+		return tempSrc;
+	}
+	else if (intensity < 0.001f)
+	{
+		return _mm512_and_si512(tempSrc, _mm512_set1_epi16(0x8000));
+	}
+	
+	v512u16 r = _mm512_and_si512(                   tempSrc,      _mm512_set1_epi16(0x001F) );
+	v512u16 g = _mm512_and_si512( _mm512_srli_epi16(tempSrc,  5), _mm512_set1_epi16(0x001F) );
+	v512u16 b = _mm512_and_si512( _mm512_srli_epi16(tempSrc, 10), _mm512_set1_epi16(0x001F) );
+	v512u16 a = _mm512_and_si512(                   tempSrc,      _mm512_set1_epi16(0x8000) );
+	
+	// 0.16 fixed-point factor; mulhi keeps the upper 16 bits of each product.
+	const v512u16 intensity_v512 = _mm512_set1_epi16( (u16)(intensity * (float)(0xFFFF)) );
+	
+	r =                    _mm512_mulhi_epu16(r, intensity_v512);
+	g = _mm512_slli_epi16( _mm512_mulhi_epu16(g, intensity_v512),  5 );
+	b = _mm512_slli_epi16( _mm512_mulhi_epu16(b, intensity_v512), 10 );
+	
+	return _mm512_or_si512( _mm512_or_si512( _mm512_or_si512(r, g), b), a);
+}
+
+template +FORCEINLINE v512u32 ColorspaceApplyIntensity32_AVX512(const v512u32 &src, float intensity) +{ + v512u32 tempSrc = (SWAP_RB) ? 
_mm512_shuffle_epi8(src, _mm512_set_epi8(63,60,61,62, 59,56,57,58, 55,52,53,54, 51,48,49,50, 47,44,45,46, 43,40,41,42, 39,36,37,38, 35,32,33,34, 31,28,29,30, 27,24,25,26, 23,20,21,22, 19,16,17,18, 15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2)) : src; + + if (intensity > 0.999f) + { + return tempSrc; + } + else if (intensity < 0.001f) + { + return _mm512_and_si512(tempSrc, _mm512_set1_epi32(0xFF000000)); + } + + v512u16 rb = _mm512_and_si512( tempSrc, _mm512_set1_epi32(0x00FF00FF) ); + v512u16 g = _mm512_and_si512( _mm512_srli_epi32(tempSrc, 8), _mm512_set1_epi32(0x000000FF) ); + v512u32 a = _mm512_and_si512( tempSrc, _mm512_set1_epi32(0xFF000000) ); + + const v512u16 intensity_v512 = _mm512_set1_epi16( (u16)(intensity * (float)(0xFFFF)) ); + + rb = _mm512_mulhi_epu16(rb, intensity_v512); + g = _mm512_slli_epi32( _mm512_mulhi_epu16( g, intensity_v512), 8 ); + + return _mm512_or_si512( _mm512_or_si512(rb, g), a); +} + +template +static size_t ColorspaceConvertBuffer555To8888Opaque_AVX512(const u16 *__restrict src, u32 *__restrict dst, const size_t pixCountVec512) +{ + size_t i = 0; + + for (; i < pixCountVec512; i+=(sizeof(v512u16)/sizeof(u16))) + { + v512u16 src_vec512 = (IS_UNALIGNED) ? 
_mm512_loadu_si512((v512u16 *)(src+i)) : _mm512_load_si512((v512u16 *)(src+i)); + v512u32 dstConvertedLo, dstConvertedHi; + ColorspaceConvert555To8888Opaque_AVX512(src_vec512, dstConvertedLo, dstConvertedHi); + + if (IS_UNALIGNED) + { + _mm512_storeu_si512((v512u32 *)(dst+i+(sizeof(v512u32)/sizeof(u32) * 0)), dstConvertedLo); + _mm512_storeu_si512((v512u32 *)(dst+i+(sizeof(v512u32)/sizeof(u32) * 1)), dstConvertedHi); + } + else + { + _mm512_store_si512((v512u32 *)(dst+i+(sizeof(v512u32)/sizeof(u32) * 0)), dstConvertedLo); + _mm512_store_si512((v512u32 *)(dst+i+(sizeof(v512u32)/sizeof(u32) * 1)), dstConvertedHi); + } + } + + return i; +} + +template +size_t ColorspaceConvertBuffer555To6665Opaque_AVX512(const u16 *__restrict src, u32 *__restrict dst, size_t pixCountVec512) +{ + size_t i = 0; + + for (; i < pixCountVec512; i+=(sizeof(v512u16)/sizeof(u16))) + { + v512u16 src_vec512 = (IS_UNALIGNED) ? _mm512_loadu_si512((v512u16 *)(src+i)) : _mm512_load_si512((v512u16 *)(src+i)); + v512u32 dstConvertedLo, dstConvertedHi; + ColorspaceConvert555To6665Opaque_AVX512(src_vec512, dstConvertedLo, dstConvertedHi); + + if (IS_UNALIGNED) + { + _mm512_storeu_si512((v512u32 *)(dst+i+(sizeof(v512u32)/sizeof(u32) * 0)), dstConvertedLo); + _mm512_storeu_si512((v512u32 *)(dst+i+(sizeof(v512u32)/sizeof(u32) * 1)), dstConvertedHi); + } + else + { + _mm512_store_si512((v512u32 *)(dst+i+(sizeof(v512u32)/sizeof(u32) * 0)), dstConvertedLo); + _mm512_store_si512((v512u32 *)(dst+i+(sizeof(v512u32)/sizeof(u32) * 1)), dstConvertedHi); + } + } + + return i; +} + +template +size_t ColorspaceConvertBuffer8888To6665_AVX512(const u32 *src, u32 *dst, size_t pixCountVec512) +{ + size_t i = 0; + + for (; i < pixCountVec512; i+=(sizeof(v512u32)/sizeof(u32))) + { + if (IS_UNALIGNED) + { + _mm512_storeu_si512( (v512u32 *)(dst+i), ColorspaceConvert8888To6665_AVX512(_mm512_loadu_si512((v512u32 *)(src+i))) ); + } + else + { + _mm512_store_si512( (v512u32 *)(dst+i), 
ColorspaceConvert8888To6665_AVX512(_mm512_load_si512((v512u32 *)(src+i))) ); + } + } + + return i; +} + +template +size_t ColorspaceConvertBuffer6665To8888_AVX512(const u32 *src, u32 *dst, size_t pixCountVec512) +{ + size_t i = 0; + + for (; i < pixCountVec512; i+=(sizeof(v512u32)/sizeof(u32))) + { + if (IS_UNALIGNED) + { + _mm512_storeu_si512( (v512u32 *)(dst+i), ColorspaceConvert6665To8888_AVX512(_mm512_loadu_si512((v512u32 *)(src+i))) ); + } + else + { + _mm512_store_si512( (v512u32 *)(dst+i), ColorspaceConvert6665To8888_AVX512(_mm512_load_si512((v512u32 *)(src+i))) ); + } + } + + return i; +} + +template +size_t ColorspaceConvertBuffer8888To5551_AVX512(const u32 *__restrict src, u16 *__restrict dst, size_t pixCountVec512) +{ + size_t i = 0; + + for (; i < pixCountVec512; i+=(sizeof(v512u16)/sizeof(u16))) + { + if (IS_UNALIGNED) + { + _mm512_storeu_si512( (v512u16 *)(dst+i), ColorspaceConvert8888To5551_AVX512(_mm512_loadu_si512((v512u32 *)(src+i)), _mm512_loadu_si512((v512u32 *)(src+i+(sizeof(v512u32)/sizeof(u32))))) ); + } + else + { + _mm512_store_si512( (v512u16 *)(dst+i), ColorspaceConvert8888To5551_AVX512(_mm512_load_si512((v512u32 *)(src+i)), _mm512_load_si512((v512u32 *)(src+i+(sizeof(v512u32)/sizeof(u32))))) ); + } + } + + return i; +} + +template +size_t ColorspaceConvertBuffer6665To5551_AVX512(const u32 *__restrict src, u16 *__restrict dst, size_t pixCountVec512) +{ + size_t i = 0; + + for (; i < pixCountVec512; i+=(sizeof(v512u16)/sizeof(u16))) + { + if (IS_UNALIGNED) + { + _mm512_storeu_si512( (v512u16 *)(dst+i), ColorspaceConvert6665To5551_AVX512(_mm512_loadu_si512((v512u32 *)(src+i)), _mm512_loadu_si512((v512u32 *)(src+i+(sizeof(v512u32)/sizeof(u32))))) ); + } + else + { + _mm512_store_si512( (v512u16 *)(dst+i), ColorspaceConvert6665To5551_AVX512(_mm512_load_si512((v512u32 *)(src+i)), _mm512_load_si512((v512u32 *)(src+i+(sizeof(v512u32)/sizeof(u32))))) ); + } + } + + return i; +} + +template +size_t 
ColorspaceConvertBuffer888XTo8888Opaque_AVX512(const u32 *src, u32 *dst, size_t pixCountVec512) +{ + size_t i = 0; + + for (; i < pixCountVec512; i+=(sizeof(v512u32)/sizeof(u32))) + { + if (IS_UNALIGNED) + { + _mm512_storeu_si512( (v512u32 *)(dst+i), ColorspaceConvert888XTo8888Opaque_AVX512(_mm512_loadu_si512((v512u32 *)(src+i))) ); + } + else + { + _mm512_store_si512( (v512u32 *)(dst+i), ColorspaceConvert888XTo8888Opaque_AVX512(_mm512_load_si512((v512u32 *)(src+i))) ); + } + } + + return i; +} + +template +size_t ColorspaceConvertBuffer555XTo888_AVX512(const u16 *__restrict src, u8 *__restrict dst, size_t pixCountVec512) +{ + size_t i = 0; + v512u16 src_v512u16[2]; + v512u32 src_v512u32[4]; + + for (; i < pixCountVec512; i+=((sizeof(v512u16)/sizeof(u16)) * 2)) + { + if (IS_UNALIGNED) + { + src_v512u16[0] = _mm512_loadu_si512( (v512u16 *)(src + i + ((sizeof(v512u16)/sizeof(u16)) * 0)) ); + src_v512u16[1] = _mm512_loadu_si512( (v512u16 *)(src + i + ((sizeof(v512u16)/sizeof(u16)) * 1)) ); + } + else + { + src_v512u16[0] = _mm512_load_si512( (v512u16 *)(src + i + ((sizeof(v512u16)/sizeof(u16)) * 0)) ); + src_v512u16[1] = _mm512_load_si512( (v512u16 *)(src + i + ((sizeof(v512u16)/sizeof(u16)) * 1)) ); + } + + v512u16 rb = _mm512_and_si512( _mm512_or_si512(_mm512_slli_epi16(src_v512u16[0], 11), _mm512_srli_epi16(src_v512u16[0], 7)), _mm512_set1_epi16(0xF8F8) ); + v512u16 g = _mm512_and_si512( _mm512_srli_epi16(src_v512u16[0], 2), _mm512_set1_epi16(0x00F8) ); + rb = _mm512_permutexvar_epi64(_mm512_set_epi64(7,5,3,1,6,4,2,0), rb); + g = _mm512_permutexvar_epi64(_mm512_set_epi64(7,5,3,1,6,4,2,0), g); + src_v512u32[0] = _mm512_unpacklo_epi16(rb, g); + src_v512u32[1] = _mm512_unpackhi_epi16(rb, g); + + rb = _mm512_and_si512( _mm512_or_si512(_mm512_slli_epi16(src_v512u16[1], 11), _mm512_srli_epi16(src_v512u16[1], 7)), _mm512_set1_epi16(0xF8F8) ); + g = _mm512_and_si512( _mm512_srli_epi16(src_v512u16[1], 2), _mm512_set1_epi16(0x00F8) ); + rb = 
_mm512_permutexvar_epi64(_mm512_set_epi64(7,5,3,1,6,4,2,0), rb); + g = _mm512_permutexvar_epi64(_mm512_set_epi64(7,5,3,1,6,4,2,0), g); + src_v512u32[2] = _mm512_unpacklo_epi16(rb, g); + src_v512u32[3] = _mm512_unpackhi_epi16(rb, g); + + src_v512u32[0] = _mm512_or_si512( src_v512u32[0], _mm512_and_si512(_mm512_srli_epi32(src_v512u32[0], 5), _mm512_set1_epi32(0x00070707)) ); + src_v512u32[1] = _mm512_or_si512( src_v512u32[1], _mm512_and_si512(_mm512_srli_epi32(src_v512u32[1], 5), _mm512_set1_epi32(0x00070707)) ); + src_v512u32[2] = _mm512_or_si512( src_v512u32[2], _mm512_and_si512(_mm512_srli_epi32(src_v512u32[2], 5), _mm512_set1_epi32(0x00070707)) ); + src_v512u32[3] = _mm512_or_si512( src_v512u32[3], _mm512_and_si512(_mm512_srli_epi32(src_v512u32[3], 5), _mm512_set1_epi32(0x00070707)) ); + +#ifdef ENABLE_AVX512_2 // The vpermb instruction requires AVX512VBMI. + if (SWAP_RB) + { + src_v512u32[0] = _mm512_permutexvar_epi8( _mm512_set_epi8(63,59,55,51, 47,43,39,35, 31,27,23,19, 15,11, 7, 3, 60,61,62,56, 57,58,52,53, 54,48,49,50, 44,45,46,40, 41,42,36,37, 38,32,33,34, 28,29,30,24, 25,26,20,21, 22,16,17,18, 12,13,14, 8, 9,10, 4, 5, 6, 0, 1, 2), src_v512u32[0] ); + src_v512u32[1] = _mm512_permutexvar_epi8( _mm512_set_epi8(22,16,17,18, 12,13,14, 8, 9,10, 4, 5, 6, 0, 1, 2, 63,59,55,51, 47,43,39,35, 31,27,23,19, 15,11, 7, 3, 60,61,62,56, 57,58,52,53, 54,48,49,50, 44,45,46,40, 41,42,36,37, 38,32,33,34, 28,29,30,24, 25,26,20,21), src_v512u32[1] ); + src_v512u32[2] = _mm512_permutexvar_epi8( _mm512_set_epi8(41,42,36,37, 38,32,33,34, 28,29,30,24, 25,26,20,21, 22,16,17,18, 12,13,14, 8, 9,10, 4, 5, 6, 0, 1, 2, 63,59,55,51, 47,43,39,35, 31,27,23,19, 15,11, 7, 3, 60,61,62,56, 57,58,52,53, 54,48,49,50, 44,45,46,40), src_v512u32[2] ); + src_v512u32[3] = _mm512_permutexvar_epi8( _mm512_set_epi8(60,61,62,56, 57,58,52,53, 54,48,49,50, 44,45,46,40, 41,42,36,37, 38,32,33,34, 28,29,30,24, 25,26,20,21, 22,16,17,18, 12,13,14, 8, 9,10, 4, 5, 6, 0, 1, 2, 63,59,55,51, 47,43,39,35, 31,27,23,19, 
15,11, 7, 3), src_v512u32[3] ); + } + else + { + src_v512u32[0] = _mm512_permutexvar_epi8( _mm512_set_epi8(63,59,55,51, 47,43,39,35, 31,27,23,19, 15,11, 7, 3, 62,61,60,58, 57,56,54,53, 52,50,49,48, 46,45,44,42, 41,40,38,37, 36,34,33,32, 30,29,28,26, 25,24,22,21, 20,18,17,16, 14,13,12,10, 9, 8, 6, 5, 4, 2, 1, 0), src_v512u32[0] ); + src_v512u32[1] = _mm512_permutexvar_epi8( _mm512_set_epi8(20,18,17,16, 14,13,12,10, 9, 8, 6, 5, 4, 2, 1, 0, 63,59,55,51, 47,43,39,35, 31,27,23,19, 15,11, 7, 3, 62,61,60,58, 57,56,54,53, 52,50,49,48, 46,45,44,42, 41,40,38,37, 36,34,33,32, 30,29,28,26, 25,24,22,21), src_v512u32[1] ); + src_v512u32[2] = _mm512_permutexvar_epi8( _mm512_set_epi8(41,40,38,37, 36,34,33,32, 30,29,28,26, 25,24,22,21, 20,18,17,16, 14,13,12,10, 9, 8, 6, 5, 4, 2, 1, 0, 63,59,55,51, 47,43,39,35, 31,27,23,19, 15,11, 7, 3, 62,61,60,58, 57,56,54,53, 52,50,49,48, 46,45,44,42), src_v512u32[2] ); + src_v512u32[3] = _mm512_permutexvar_epi8( _mm512_set_epi8(62,61,60,58, 57,56,54,53, 52,50,49,48, 46,45,44,42, 41,40,38,37, 36,34,33,32, 30,29,28,26, 25,24,22,21, 20,18,17,16, 14,13,12,10, 9, 8, 6, 5, 4, 2, 1, 0, 63,59,55,51, 47,43,39,35, 31,27,23,19, 15,11, 7, 3), src_v512u32[3] ); + } +#else + if (SWAP_RB) + { + src_v512u32[0] = _mm512_shuffle_epi8(src_v512u32[0], _mm512_set_epi8(63,59,55,51, 60,61,62,56, 57,58,52,53, 54,48,49,50, 47,43,39,35, 44,45,46,40, 41,42,36,37, 38,32,33,34, 31,27,23,19, 28,29,30,24, 25,26,20,21, 22,16,17,18, 15,11, 7, 3, 12,13,14, 8, 9,10, 4, 5, 6, 0, 1, 2)); + src_v512u32[1] = _mm512_shuffle_epi8(src_v512u32[1], _mm512_set_epi8(63,59,55,51, 60,61,62,56, 57,58,52,53, 54,48,49,50, 47,43,39,35, 44,45,46,40, 41,42,36,37, 38,32,33,34, 31,27,23,19, 28,29,30,24, 25,26,20,21, 22,16,17,18, 15,11, 7, 3, 12,13,14, 8, 9,10, 4, 5, 6, 0, 1, 2)); + src_v512u32[2] = _mm512_shuffle_epi8(src_v512u32[2], _mm512_set_epi8(63,59,55,51, 60,61,62,56, 57,58,52,53, 54,48,49,50, 47,43,39,35, 44,45,46,40, 41,42,36,37, 38,32,33,34, 31,27,23,19, 28,29,30,24, 25,26,20,21, 
22,16,17,18, 15,11, 7, 3, 12,13,14, 8, 9,10, 4, 5, 6, 0, 1, 2)); + src_v512u32[3] = _mm512_shuffle_epi8(src_v512u32[3], _mm512_set_epi8(63,59,55,51, 60,61,62,56, 57,58,52,53, 54,48,49,50, 47,43,39,35, 44,45,46,40, 41,42,36,37, 38,32,33,34, 31,27,23,19, 28,29,30,24, 25,26,20,21, 22,16,17,18, 15,11, 7, 3, 12,13,14, 8, 9,10, 4, 5, 6, 0, 1, 2)); + } + else + { + src_v512u32[0] = _mm512_shuffle_epi8(src_v512u32[0], _mm512_set_epi8(63,59,55,51, 62,61,60,58, 57,56,54,53, 52,50,49,48, 47,43,39,35, 46,45,44,42, 41,40,38,37, 36,34,33,32, 31,27,23,19, 30,29,28,26, 25,24,22,21, 20,18,17,16, 15,11, 7, 3, 14,13,12,10, 9, 8, 6, 5, 4, 2, 1, 0)); + src_v512u32[1] = _mm512_shuffle_epi8(src_v512u32[1], _mm512_set_epi8(63,59,55,51, 62,61,60,58, 57,56,54,53, 52,50,49,48, 47,43,39,35, 46,45,44,42, 41,40,38,37, 36,34,33,32, 31,27,23,19, 30,29,28,26, 25,24,22,21, 20,18,17,16, 15,11, 7, 3, 14,13,12,10, 9, 8, 6, 5, 4, 2, 1, 0)); + src_v512u32[2] = _mm512_shuffle_epi8(src_v512u32[2], _mm512_set_epi8(63,59,55,51, 62,61,60,58, 57,56,54,53, 52,50,49,48, 47,43,39,35, 46,45,44,42, 41,40,38,37, 36,34,33,32, 31,27,23,19, 30,29,28,26, 25,24,22,21, 20,18,17,16, 15,11, 7, 3, 14,13,12,10, 9, 8, 6, 5, 4, 2, 1, 0)); + src_v512u32[3] = _mm512_shuffle_epi8(src_v512u32[3], _mm512_set_epi8(63,59,55,51, 62,61,60,58, 57,56,54,53, 52,50,49,48, 47,43,39,35, 46,45,44,42, 41,40,38,37, 36,34,33,32, 31,27,23,19, 30,29,28,26, 25,24,22,21, 20,18,17,16, 15,11, 7, 3, 14,13,12,10, 9, 8, 6, 5, 4, 2, 1, 0)); + } + + // This is necessary because vpshufb cannot shuffle bits across 128-bit lanes, but vpermd can. 
+ src_v512u32[0] = _mm512_permutexvar_epi32( _mm512_set_epi32(15,11, 7, 3, 14,13,12,10, 9, 8, 6, 5, 4, 2, 1, 0), src_v512u32[0] ); + src_v512u32[1] = _mm512_permutexvar_epi32( _mm512_set_epi32( 4, 2, 1, 0, 15,11, 7, 3, 14,13,12,10, 9, 8, 6, 5), src_v512u32[1] ); + src_v512u32[2] = _mm512_permutexvar_epi32( _mm512_set_epi32( 9, 8, 6, 5, 4, 2, 1, 0, 15,11, 7, 3, 14,13,12,10), src_v512u32[2] ); + src_v512u32[3] = _mm512_permutexvar_epi32( _mm512_set_epi32(14,13,12,10, 9, 8, 6, 5, 4, 2, 1, 0, 15,11, 7, 3), src_v512u32[3] ); +#endif + + if (IS_UNALIGNED) + { + _mm512_storeu_si512( (v512u8 *)(dst + (i * 3) + (sizeof(v512u32) * 0)), _mm512_mask_blend_epi32(0xF000, src_v512u32[0], src_v512u32[1]) ); + _mm512_storeu_si512( (v512u8 *)(dst + (i * 3) + (sizeof(v512u32) * 1)), _mm512_mask_blend_epi32(0xFF00, src_v512u32[1], src_v512u32[2]) ); + _mm512_storeu_si512( (v512u8 *)(dst + (i * 3) + (sizeof(v512u32) * 2)), _mm512_mask_blend_epi32(0xFFF0, src_v512u32[2], src_v512u32[3]) ); + } + else + { + _mm512_store_si512( (v512u8 *)(dst + (i * 3) + (sizeof(v512u32) * 0)), _mm512_mask_blend_epi32(0xF000, src_v512u32[0], src_v512u32[1]) ); + _mm512_store_si512( (v512u8 *)(dst + (i * 3) + (sizeof(v512u32) * 1)), _mm512_mask_blend_epi32(0xFF00, src_v512u32[1], src_v512u32[2]) ); + _mm512_store_si512( (v512u8 *)(dst + (i * 3) + (sizeof(v512u32) * 2)), _mm512_mask_blend_epi32(0xFFF0, src_v512u32[2], src_v512u32[3]) ); + } + } + + return i; +} + +template +size_t ColorspaceConvertBuffer888XTo888_AVX512(const u32 *__restrict src, u8 *__restrict dst, size_t pixCountVec512) +{ + size_t i = 0; + v512u32 src_v512u32[4]; + + for (; i < pixCountVec512; i+=((sizeof(v512u32)/sizeof(u32)) * 4)) + { + if (IS_UNALIGNED) + { + src_v512u32[0] = _mm512_loadu_si512( (v512u32 *)(src + i + ((sizeof(v512u32)/sizeof(u32)) * 0)) ); + src_v512u32[1] = _mm512_loadu_si512( (v512u32 *)(src + i + ((sizeof(v512u32)/sizeof(u32)) * 1)) ); + src_v512u32[2] = _mm512_loadu_si512( (v512u32 *)(src + i + 
((sizeof(v512u32)/sizeof(u32)) * 2)) ); + src_v512u32[3] = _mm512_loadu_si512( (v512u32 *)(src + i + ((sizeof(v512u32)/sizeof(u32)) * 3)) ); + } + else + { + src_v512u32[0] = _mm512_load_si512( (v512u32 *)(src + i + ((sizeof(v512u32)/sizeof(u32)) * 0)) ); + src_v512u32[1] = _mm512_load_si512( (v512u32 *)(src + i + ((sizeof(v512u32)/sizeof(u32)) * 1)) ); + src_v512u32[2] = _mm512_load_si512( (v512u32 *)(src + i + ((sizeof(v512u32)/sizeof(u32)) * 2)) ); + src_v512u32[3] = _mm512_load_si512( (v512u32 *)(src + i + ((sizeof(v512u32)/sizeof(u32)) * 3)) ); + } + +#ifdef ENABLE_AVX512_2 // The vpermb instruction requires AVX512VBMI. + if (SWAP_RB) + { + src_v512u32[0] = _mm512_permutexvar_epi8( _mm512_set_epi8(63,59,55,51, 47,43,39,35, 31,27,23,19, 15,11, 7, 3, 60,61,62,56, 57,58,52,53, 54,48,49,50, 44,45,46,40, 41,42,36,37, 38,32,33,34, 28,29,30,24, 25,26,20,21, 22,16,17,18, 12,13,14, 8, 9,10, 4, 5, 6, 0, 1, 2), src_v512u32[0] ); + src_v512u32[1] = _mm512_permutexvar_epi8( _mm512_set_epi8(22,16,17,18, 12,13,14, 8, 9,10, 4, 5, 6, 0, 1, 2, 63,59,55,51, 47,43,39,35, 31,27,23,19, 15,11, 7, 3, 60,61,62,56, 57,58,52,53, 54,48,49,50, 44,45,46,40, 41,42,36,37, 38,32,33,34, 28,29,30,24, 25,26,20,21), src_v512u32[1] ); + src_v512u32[2] = _mm512_permutexvar_epi8( _mm512_set_epi8(41,42,36,37, 38,32,33,34, 28,29,30,24, 25,26,20,21, 22,16,17,18, 12,13,14, 8, 9,10, 4, 5, 6, 0, 1, 2, 63,59,55,51, 47,43,39,35, 31,27,23,19, 15,11, 7, 3, 60,61,62,56, 57,58,52,53, 54,48,49,50, 44,45,46,40), src_v512u32[2] ); + src_v512u32[3] = _mm512_permutexvar_epi8( _mm512_set_epi8(60,61,62,56, 57,58,52,53, 54,48,49,50, 44,45,46,40, 41,42,36,37, 38,32,33,34, 28,29,30,24, 25,26,20,21, 22,16,17,18, 12,13,14, 8, 9,10, 4, 5, 6, 0, 1, 2, 63,59,55,51, 47,43,39,35, 31,27,23,19, 15,11, 7, 3), src_v512u32[3] ); + } + else + { + src_v512u32[0] = _mm512_permutexvar_epi8( _mm512_set_epi8(63,59,55,51, 47,43,39,35, 31,27,23,19, 15,11, 7, 3, 62,61,60,58, 57,56,54,53, 52,50,49,48, 46,45,44,42, 41,40,38,37, 36,34,33,32, 
30,29,28,26, 25,24,22,21, 20,18,17,16, 14,13,12,10, 9, 8, 6, 5, 4, 2, 1, 0), src_v512u32[0] ); + src_v512u32[1] = _mm512_permutexvar_epi8( _mm512_set_epi8(20,18,17,16, 14,13,12,10, 9, 8, 6, 5, 4, 2, 1, 0, 63,59,55,51, 47,43,39,35, 31,27,23,19, 15,11, 7, 3, 62,61,60,58, 57,56,54,53, 52,50,49,48, 46,45,44,42, 41,40,38,37, 36,34,33,32, 30,29,28,26, 25,24,22,21), src_v512u32[1] ); + src_v512u32[2] = _mm512_permutexvar_epi8( _mm512_set_epi8(41,40,38,37, 36,34,33,32, 30,29,28,26, 25,24,22,21, 20,18,17,16, 14,13,12,10, 9, 8, 6, 5, 4, 2, 1, 0, 63,59,55,51, 47,43,39,35, 31,27,23,19, 15,11, 7, 3, 62,61,60,58, 57,56,54,53, 52,50,49,48, 46,45,44,42), src_v512u32[2] ); + src_v512u32[3] = _mm512_permutexvar_epi8( _mm512_set_epi8(62,61,60,58, 57,56,54,53, 52,50,49,48, 46,45,44,42, 41,40,38,37, 36,34,33,32, 30,29,28,26, 25,24,22,21, 20,18,17,16, 14,13,12,10, 9, 8, 6, 5, 4, 2, 1, 0, 63,59,55,51, 47,43,39,35, 31,27,23,19, 15,11, 7, 3), src_v512u32[3] ); + } +#else + if (SWAP_RB) + { + src_v512u32[0] = _mm512_shuffle_epi8(src_v512u32[0], _mm512_set_epi8(63,59,55,51, 60,61,62,56, 57,58,52,53, 54,48,49,50, 47,43,39,35, 44,45,46,40, 41,42,36,37, 38,32,33,34, 31,27,23,19, 28,29,30,24, 25,26,20,21, 22,16,17,18, 15,11, 7, 3, 12,13,14, 8, 9,10, 4, 5, 6, 0, 1, 2)); + src_v512u32[1] = _mm512_shuffle_epi8(src_v512u32[1], _mm512_set_epi8(63,59,55,51, 60,61,62,56, 57,58,52,53, 54,48,49,50, 47,43,39,35, 44,45,46,40, 41,42,36,37, 38,32,33,34, 31,27,23,19, 28,29,30,24, 25,26,20,21, 22,16,17,18, 15,11, 7, 3, 12,13,14, 8, 9,10, 4, 5, 6, 0, 1, 2)); + src_v512u32[2] = _mm512_shuffle_epi8(src_v512u32[2], _mm512_set_epi8(63,59,55,51, 60,61,62,56, 57,58,52,53, 54,48,49,50, 47,43,39,35, 44,45,46,40, 41,42,36,37, 38,32,33,34, 31,27,23,19, 28,29,30,24, 25,26,20,21, 22,16,17,18, 15,11, 7, 3, 12,13,14, 8, 9,10, 4, 5, 6, 0, 1, 2)); + src_v512u32[3] = _mm512_shuffle_epi8(src_v512u32[3], _mm512_set_epi8(63,59,55,51, 60,61,62,56, 57,58,52,53, 54,48,49,50, 47,43,39,35, 44,45,46,40, 41,42,36,37, 38,32,33,34, 
31,27,23,19, 28,29,30,24, 25,26,20,21, 22,16,17,18, 15,11, 7, 3, 12,13,14, 8, 9,10, 4, 5, 6, 0, 1, 2)); + } + else + { + src_v512u32[0] = _mm512_shuffle_epi8(src_v512u32[0], _mm512_set_epi8(63,59,55,51, 62,61,60,58, 57,56,54,53, 52,50,49,48, 47,43,39,35, 46,45,44,42, 41,40,38,37, 36,34,33,32, 31,27,23,19, 30,29,28,26, 25,24,22,21, 20,18,17,16, 15,11, 7, 3, 14,13,12,10, 9, 8, 6, 5, 4, 2, 1, 0)); + src_v512u32[1] = _mm512_shuffle_epi8(src_v512u32[1], _mm512_set_epi8(63,59,55,51, 62,61,60,58, 57,56,54,53, 52,50,49,48, 47,43,39,35, 46,45,44,42, 41,40,38,37, 36,34,33,32, 31,27,23,19, 30,29,28,26, 25,24,22,21, 20,18,17,16, 15,11, 7, 3, 14,13,12,10, 9, 8, 6, 5, 4, 2, 1, 0)); + src_v512u32[2] = _mm512_shuffle_epi8(src_v512u32[2], _mm512_set_epi8(63,59,55,51, 62,61,60,58, 57,56,54,53, 52,50,49,48, 47,43,39,35, 46,45,44,42, 41,40,38,37, 36,34,33,32, 31,27,23,19, 30,29,28,26, 25,24,22,21, 20,18,17,16, 15,11, 7, 3, 14,13,12,10, 9, 8, 6, 5, 4, 2, 1, 0)); + src_v512u32[3] = _mm512_shuffle_epi8(src_v512u32[3], _mm512_set_epi8(63,59,55,51, 62,61,60,58, 57,56,54,53, 52,50,49,48, 47,43,39,35, 46,45,44,42, 41,40,38,37, 36,34,33,32, 31,27,23,19, 30,29,28,26, 25,24,22,21, 20,18,17,16, 15,11, 7, 3, 14,13,12,10, 9, 8, 6, 5, 4, 2, 1, 0)); + } + + // This is necessary because vpshufb cannot shuffle bits across 128-bit lanes, but vpermd can. 
+ src_v512u32[0] = _mm512_permutexvar_epi32( _mm512_set_epi32(15,11, 7, 3, 14,13,12,10, 9, 8, 6, 5, 4, 2, 1, 0), src_v512u32[0] ); + src_v512u32[1] = _mm512_permutexvar_epi32( _mm512_set_epi32( 4, 2, 1, 0, 15,11, 7, 3, 14,13,12,10, 9, 8, 6, 5), src_v512u32[1] ); + src_v512u32[2] = _mm512_permutexvar_epi32( _mm512_set_epi32( 9, 8, 6, 5, 4, 2, 1, 0, 15,11, 7, 3, 14,13,12,10), src_v512u32[2] ); + src_v512u32[3] = _mm512_permutexvar_epi32( _mm512_set_epi32(14,13,12,10, 9, 8, 6, 5, 4, 2, 1, 0, 15,11, 7, 3), src_v512u32[3] ); +#endif + + if (IS_UNALIGNED) + { + _mm512_storeu_si512( (v512u8 *)(dst + (i * 3) + (sizeof(v512u32) * 0)), _mm512_mask_blend_epi32(0xF000, src_v512u32[0], src_v512u32[1]) ); + _mm512_storeu_si512( (v512u8 *)(dst + (i * 3) + (sizeof(v512u32) * 1)), _mm512_mask_blend_epi32(0xFF00, src_v512u32[1], src_v512u32[2]) ); + _mm512_storeu_si512( (v512u8 *)(dst + (i * 3) + (sizeof(v512u32) * 2)), _mm512_mask_blend_epi32(0xFFF0, src_v512u32[2], src_v512u32[3]) ); + } + else + { + _mm512_store_si512( (v512u8 *)(dst + (i * 3) + (sizeof(v512u32) * 0)), _mm512_mask_blend_epi32(0xF000, src_v512u32[0], src_v512u32[1]) ); + _mm512_store_si512( (v512u8 *)(dst + (i * 3) + (sizeof(v512u32) * 1)), _mm512_mask_blend_epi32(0xFF00, src_v512u32[1], src_v512u32[2]) ); + _mm512_store_si512( (v512u8 *)(dst + (i * 3) + (sizeof(v512u32) * 2)), _mm512_mask_blend_epi32(0xFFF0, src_v512u32[2], src_v512u32[3]) ); + } + } + + return i; +} + +template +size_t ColorspaceCopyBuffer16_AVX512(const u16 *src, u16 *dst, size_t pixCountVec512) +{ + if (!SWAP_RB) + { + memcpy(dst, src, pixCountVec512 * sizeof(u16)); + return pixCountVec512; + } + + size_t i = 0; + + for (; i < pixCountVec512; i+=(sizeof(v512u16)/sizeof(u16))) + { + v512u16 src_vec512 = (IS_UNALIGNED) ? 
_mm512_loadu_si512((v512u16 *)(src+i)) : _mm512_load_si512((v512u16 *)(src+i)); + + if (IS_UNALIGNED) + { + _mm512_storeu_si512((v512u16 *)(dst+i), ColorspaceCopy16_AVX512(src_vec512)); + } + else + { + _mm512_store_si512((v512u16 *)(dst+i), ColorspaceCopy16_AVX512(src_vec512)); + } + } + + return i; +} + +template +size_t ColorspaceCopyBuffer32_AVX512(const u32 *src, u32 *dst, size_t pixCountVec512) +{ + if (!SWAP_RB) + { + memcpy(dst, src, pixCountVec512 * sizeof(u32)); + return pixCountVec512; + } + + size_t i = 0; + + for (; i < pixCountVec512; i+=(sizeof(v512u32)/sizeof(u32))) + { + v512u32 src_vec512 = (IS_UNALIGNED) ? _mm512_loadu_si512((v512u32 *)(src+i)) : _mm512_load_si512((v512u32 *)(src+i)); + + if (IS_UNALIGNED) + { + _mm512_storeu_si512((v512u32 *)(dst+i), ColorspaceCopy32_AVX512(src_vec512)); + } + else + { + _mm512_store_si512((v512u32 *)(dst+i), ColorspaceCopy32_AVX512(src_vec512)); + } + } + + return i; +} + +template +size_t ColorspaceApplyIntensityToBuffer16_AVX512(u16 *dst, size_t pixCountVec512, float intensity) +{ + size_t i = 0; + + if (intensity > 0.999f) + { + if (SWAP_RB) + { + for (; i < pixCountVec512; i+=(sizeof(v512u16)/sizeof(u16))) + { + const v512u16 dst_v512 = (IS_UNALIGNED) ? 
_mm512_loadu_si512((v512u16 *)(dst+i)) : _mm512_load_si512((v512u16 *)(dst+i)); + const v512u16 tempDst = _mm512_or_si512( _mm512_or_si512(_mm512_srli_epi16(_mm512_and_si512(dst_v512, _mm512_set1_epi16(0x7C00)), 10), _mm512_or_si512(_mm512_and_si512(dst_v512, _mm512_set1_epi16(0x0E30)), _mm512_slli_epi16(_mm512_and_si512(dst_v512, _mm512_set1_epi16(0x001F)), 10))), _mm512_and_si512(dst_v512, _mm512_set1_epi16(0x8000)) ); + + if (IS_UNALIGNED) + { + _mm512_storeu_si512( (v512u16 *)(dst+i), tempDst); + } + else + { + _mm512_store_si512( (v512u16 *)(dst+i), tempDst); + } + } + } + else + { + return pixCountVec512; + } + } + else if (intensity < 0.001f) + { + for (; i < pixCountVec512; i+=(sizeof(v512u16)/sizeof(u16))) + { + if (IS_UNALIGNED) + { + _mm512_storeu_si512( (v512u16 *)(dst+i), _mm512_and_si512(_mm512_loadu_si512((v512u16 *)(dst+i)), _mm512_set1_epi16(0x8000)) ); + } + else + { + _mm512_store_si512( (v512u16 *)(dst+i), _mm512_and_si512(_mm512_load_si512((v512u16 *)(dst+i)), _mm512_set1_epi16(0x8000)) ); + } + } + } + else + { + const v512u16 intensity_v512 = _mm512_set1_epi16( (u16)(intensity * (float)(0xFFFF)) ); + + for (; i < pixCountVec512; i+=(sizeof(v512u16)/sizeof(u16))) + { + v512u16 dst_v512 = (IS_UNALIGNED) ? _mm512_loadu_si512((v512u16 *)(dst+i)) : _mm512_load_si512((v512u16 *)(dst+i)); + v512u16 tempDst = (SWAP_RB) ? 
_mm512_or_si512( _mm512_or_si512(_mm512_srli_epi16(_mm512_and_si512(dst_v512, _mm512_set1_epi16(0x7C00)), 10), _mm512_or_si512(_mm512_and_si512(dst_v512, _mm512_set1_epi16(0x0E30)), _mm512_slli_epi16(_mm512_and_si512(dst_v512, _mm512_set1_epi16(0x001F)), 10))), _mm512_and_si512(dst_v512, _mm512_set1_epi16(0x8000)) ) : dst_v512; + + v512u16 r = _mm512_and_si512( tempDst, _mm512_set1_epi16(0x001F) ); + v512u16 g = _mm512_and_si512( _mm512_srli_epi16(tempDst, 5), _mm512_set1_epi16(0x001F) ); + v512u16 b = _mm512_and_si512( _mm512_srli_epi16(tempDst, 10), _mm512_set1_epi16(0x001F) ); + v512u16 a = _mm512_and_si512( tempDst, _mm512_set1_epi16(0x8000) ); + + r = _mm512_mulhi_epu16(r, intensity_v512); + g = _mm512_slli_epi32( _mm512_mulhi_epu16(g, intensity_v512), 5 ); + b = _mm512_slli_epi32( _mm512_mulhi_epu16(b, intensity_v512), 10 ); + + tempDst = _mm512_or_si512( _mm512_or_si512( _mm512_or_si512(r, g), b), a); + + if (IS_UNALIGNED) + { + _mm512_storeu_si512((v512u16 *)(dst+i), tempDst); + } + else + { + _mm512_store_si512((v512u16 *)(dst+i), tempDst); + } + } + } + + return i; +} + +template +size_t ColorspaceApplyIntensityToBuffer32_AVX512(u32 *dst, size_t pixCountVec512, float intensity) +{ + size_t i = 0; + + if (intensity > 0.999f) + { + if (SWAP_RB) + { + for (; i < pixCountVec512; i+=(sizeof(v512u32)/sizeof(u32))) + { + const v512u32 dst_v512 = (IS_UNALIGNED) ? 
_mm512_loadu_si512((v512u32 *)(dst+i)) : _mm512_load_si512((v512u32 *)(dst+i)); + const v512u32 tempDst = _mm512_shuffle_epi8(dst_v512, _mm512_set_epi8(63,60,61,62, 59,56,57,58, 55,52,53,54, 51,48,49,50, 47,44,45,46, 43,40,41,42, 39,36,37,38, 35,32,33,34, 31,28,29,30, 27,24,25,26, 23,20,21,22, 19,16,17,18, 15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2)); + + if (IS_UNALIGNED) + { + _mm512_storeu_si512( (v512u32 *)(dst+i), tempDst); + } + else + { + _mm512_store_si512( (v512u32 *)(dst+i), tempDst); + } + } + } + else + { + return pixCountVec512; + } + } + else if (intensity < 0.001f) + { + for (; i < pixCountVec512; i+=(sizeof(v512u32)/sizeof(u32))) + { + if (IS_UNALIGNED) + { + _mm512_storeu_si512( (v512u32 *)(dst+i), _mm512_and_si512(_mm512_loadu_si512((v512u32 *)(dst+i)), _mm512_set1_epi32(0xFF000000)) ); + } + else + { + _mm512_store_si512( (v512u32 *)(dst+i), _mm512_and_si512(_mm512_load_si512((v512u32 *)(dst+i)), _mm512_set1_epi32(0xFF000000)) ); + } + } + } + else + { + const v512u16 intensity_v512 = _mm512_set1_epi16( (u16)(intensity * (float)(0xFFFF)) ); + + for (; i < pixCountVec512; i+=(sizeof(v512u32)/sizeof(u32))) + { + v512u32 dst_v512 = (IS_UNALIGNED) ? _mm512_loadu_si512((v512u32 *)(dst+i)) : _mm512_load_si512((v512u32 *)(dst+i)); + v512u32 tempDst = (SWAP_RB) ? 
_mm512_shuffle_epi8(dst_v512, _mm512_set_epi8(63,60,61,62, 59,56,57,58, 55,52,53,54, 51,48,49,50, 47,44,45,46, 43,40,41,42, 39,36,37,38, 35,32,33,34, 31,28,29,30, 27,24,25,26, 23,20,21,22, 19,16,17,18, 15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2)) : dst_v512; + + v512u16 rb = _mm512_and_si512( tempDst, _mm512_set1_epi32(0x00FF00FF) ); + v512u16 g = _mm512_and_si512( _mm512_srli_epi32(tempDst, 8), _mm512_set1_epi32(0x000000FF) ); + v512u32 a = _mm512_and_si512( tempDst, _mm512_set1_epi32(0xFF000000) ); + + rb = _mm512_mulhi_epu16(rb, intensity_v512); + g = _mm512_slli_epi32( _mm512_mulhi_epu16( g, intensity_v512), 8 ); + + tempDst = _mm512_or_si512( _mm512_or_si512(rb, g), a); + + if (IS_UNALIGNED) + { + _mm512_storeu_si512((v512u32 *)(dst+i), tempDst); + } + else + { + _mm512_store_si512((v512u32 *)(dst+i), tempDst); + } + } + } + + return i; +} + +template +size_t ColorspaceHandler_AVX512::ConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To8888Opaque_AVX512(src, dst, pixCount); +} + +template +size_t ColorspaceHandler_AVX512::ConvertBuffer555To8888Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To8888Opaque_AVX512(src, dst, pixCount); +} + +template +size_t ColorspaceHandler_AVX512::ConvertBuffer555To8888Opaque_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To8888Opaque_AVX512(src, dst, pixCount); +} + +template +size_t ColorspaceHandler_AVX512::ConvertBuffer555To8888Opaque_SwapRB_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To8888Opaque_AVX512(src, dst, pixCount); +} + +template +size_t ColorspaceHandler_AVX512::ConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return 
ColorspaceConvertBuffer555To6665Opaque_AVX512(src, dst, pixCount); +} + +template +size_t ColorspaceHandler_AVX512::ConvertBuffer555To6665Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To6665Opaque_AVX512(src, dst, pixCount); +} + +template +size_t ColorspaceHandler_AVX512::ConvertBuffer555To6665Opaque_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To6665Opaque_AVX512(src, dst, pixCount); +} + +template +size_t ColorspaceHandler_AVX512::ConvertBuffer555To6665Opaque_SwapRB_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To6665Opaque_AVX512(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::ConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To6665_AVX512(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::ConvertBuffer8888To6665_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To6665_AVX512(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::ConvertBuffer8888To6665_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To6665_AVX512(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::ConvertBuffer8888To6665_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To6665_AVX512(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::ConvertBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To8888_AVX512(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::ConvertBuffer6665To8888_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To8888_AVX512(src, dst, pixCount); +} + +size_t 
ColorspaceHandler_AVX512::ConvertBuffer6665To8888_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To8888_AVX512(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::ConvertBuffer6665To8888_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To8888_AVX512(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::ConvertBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To5551_AVX512(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::ConvertBuffer8888To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To5551_AVX512(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::ConvertBuffer8888To5551_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To5551_AVX512(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::ConvertBuffer8888To5551_SwapRB_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To5551_AVX512(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::ConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To5551_AVX512(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::ConvertBuffer6665To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To5551_AVX512(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::ConvertBuffer6665To5551_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To5551_AVX512(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::ConvertBuffer6665To5551_SwapRB_IsUnaligned(const u32 
*__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To5551_AVX512(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::ConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo8888Opaque_AVX512(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::ConvertBuffer888XTo8888Opaque_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo8888Opaque_AVX512(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::ConvertBuffer888XTo8888Opaque_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo8888Opaque_AVX512(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::ConvertBuffer888XTo8888Opaque_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo8888Opaque_AVX512(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::ConvertBuffer555XTo888(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555XTo888_AVX512(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::ConvertBuffer555XTo888_SwapRB(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555XTo888_AVX512(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::ConvertBuffer555XTo888_IsUnaligned(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555XTo888_AVX512(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::ConvertBuffer555XTo888_SwapRB_IsUnaligned(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555XTo888_AVX512(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::ConvertBuffer888XTo888(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return 
ColorspaceConvertBuffer888XTo888_AVX512(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::ConvertBuffer888XTo888_SwapRB(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo888_AVX512(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::ConvertBuffer888XTo888_IsUnaligned(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo888_AVX512(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::ConvertBuffer888XTo888_SwapRB_IsUnaligned(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo888_AVX512(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const +{ + return ColorspaceCopyBuffer16_AVX512(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::CopyBuffer16_SwapRB_IsUnaligned(const u16 *src, u16 *dst, size_t pixCount) const +{ + return ColorspaceCopyBuffer16_AVX512(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::CopyBuffer32_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceCopyBuffer32_AVX512(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::CopyBuffer32_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceCopyBuffer32_AVX512(src, dst, pixCount); +} + +size_t ColorspaceHandler_AVX512::ApplyIntensityToBuffer16(u16 *dst, size_t pixCount, float intensity) const +{ + return ColorspaceApplyIntensityToBuffer16_AVX512(dst, pixCount, intensity); +} + +size_t ColorspaceHandler_AVX512::ApplyIntensityToBuffer16_SwapRB(u16 *dst, size_t pixCount, float intensity) const +{ + return ColorspaceApplyIntensityToBuffer16_AVX512(dst, pixCount, intensity); +} + +size_t ColorspaceHandler_AVX512::ApplyIntensityToBuffer16_IsUnaligned(u16 *dst, size_t pixCount, float intensity) const +{ + return 
ColorspaceApplyIntensityToBuffer16_AVX512(dst, pixCount, intensity); +} + +size_t ColorspaceHandler_AVX512::ApplyIntensityToBuffer16_SwapRB_IsUnaligned(u16 *dst, size_t pixCount, float intensity) const +{ + return ColorspaceApplyIntensityToBuffer16_AVX512(dst, pixCount, intensity); +} + +size_t ColorspaceHandler_AVX512::ApplyIntensityToBuffer32(u32 *dst, size_t pixCount, float intensity) const +{ + return ColorspaceApplyIntensityToBuffer32_AVX512(dst, pixCount, intensity); +} + +size_t ColorspaceHandler_AVX512::ApplyIntensityToBuffer32_SwapRB(u32 *dst, size_t pixCount, float intensity) const +{ + return ColorspaceApplyIntensityToBuffer32_AVX512(dst, pixCount, intensity); +} + +size_t ColorspaceHandler_AVX512::ApplyIntensityToBuffer32_IsUnaligned(u32 *dst, size_t pixCount, float intensity) const +{ + return ColorspaceApplyIntensityToBuffer32_AVX512(dst, pixCount, intensity); +} + +size_t ColorspaceHandler_AVX512::ApplyIntensityToBuffer32_SwapRB_IsUnaligned(u32 *dst, size_t pixCount, float intensity) const +{ + return ColorspaceApplyIntensityToBuffer32_AVX512(dst, pixCount, intensity); +} + +template void ColorspaceConvert555To8888_AVX512(const v512u16 &srcColor, const v512u16 &srcAlphaBits, v512u32 &dstLo, v512u32 &dstHi); +template void ColorspaceConvert555To8888_AVX512(const v512u16 &srcColor, const v512u16 &srcAlphaBits, v512u32 &dstLo, v512u32 &dstHi); + +template void ColorspaceConvert555XTo888X_AVX512(const v512u16 &srcColor, v512u32 &dstLo, v512u32 &dstHi); +template void ColorspaceConvert555XTo888X_AVX512(const v512u16 &srcColor, v512u32 &dstLo, v512u32 &dstHi); + +template void ColorspaceConvert555To6665_AVX512(const v512u16 &srcColor, const v512u16 &srcAlphaBits, v512u32 &dstLo, v512u32 &dstHi); +template void ColorspaceConvert555To6665_AVX512(const v512u16 &srcColor, const v512u16 &srcAlphaBits, v512u32 &dstLo, v512u32 &dstHi); + +template void ColorspaceConvert555XTo666X_AVX512(const v512u16 &srcColor, v512u32 &dstLo, v512u32 &dstHi); +template void 
ColorspaceConvert555XTo666X_AVX512(const v512u16 &srcColor, v512u32 &dstLo, v512u32 &dstHi); + +template void ColorspaceConvert555To8888Opaque_AVX512(const v512u16 &srcColor, v512u32 &dstLo, v512u32 &dstHi); +template void ColorspaceConvert555To8888Opaque_AVX512(const v512u16 &srcColor, v512u32 &dstLo, v512u32 &dstHi); + +template void ColorspaceConvert555To6665Opaque_AVX512(const v512u16 &srcColor, v512u32 &dstLo, v512u32 &dstHi); +template void ColorspaceConvert555To6665Opaque_AVX512(const v512u16 &srcColor, v512u32 &dstLo, v512u32 &dstHi); + +template v512u32 ColorspaceConvert8888To6665_AVX512(const v512u32 &src); +template v512u32 ColorspaceConvert8888To6665_AVX512(const v512u32 &src); + +template v512u32 ColorspaceConvert6665To8888_AVX512(const v512u32 &src); +template v512u32 ColorspaceConvert6665To8888_AVX512(const v512u32 &src); + +template v512u16 ColorspaceConvert8888To5551_AVX512(const v512u32 &srcLo, const v512u32 &srcHi); +template v512u16 ColorspaceConvert8888To5551_AVX512(const v512u32 &srcLo, const v512u32 &srcHi); + +template v512u16 ColorspaceConvert6665To5551_AVX512(const v512u32 &srcLo, const v512u32 &srcHi); +template v512u16 ColorspaceConvert6665To5551_AVX512(const v512u32 &srcLo, const v512u32 &srcHi); + +template v512u32 ColorspaceConvert888XTo8888Opaque_AVX512(const v512u32 &src); +template v512u32 ColorspaceConvert888XTo8888Opaque_AVX512(const v512u32 &src); + +template v512u16 ColorspaceCopy16_AVX512(const v512u16 &src); +template v512u16 ColorspaceCopy16_AVX512(const v512u16 &src); + +template v512u32 ColorspaceCopy32_AVX512(const v512u32 &src); +template v512u32 ColorspaceCopy32_AVX512(const v512u32 &src); + +template v512u16 ColorspaceApplyIntensity16_AVX512(const v512u16 &src, float intensity); +template v512u16 ColorspaceApplyIntensity16_AVX512(const v512u16 &src, float intensity); + +template v512u32 ColorspaceApplyIntensity32_AVX512(const v512u32 &src, float intensity); +template v512u32 ColorspaceApplyIntensity32_AVX512(const 
v512u32 &src, float intensity); + +#endif // ENABLE_AVX512_1 diff --git a/FSCHMasterEditor/nds/colorspacehandler/colorspacehandler_AVX512.h b/FSCHMasterEditor/nds/colorspacehandler/colorspacehandler_AVX512.h new file mode 100644 index 0000000..b04077a --- /dev/null +++ b/FSCHMasterEditor/nds/colorspacehandler/colorspacehandler_AVX512.h @@ -0,0 +1,114 @@ +/* + Copyright (C) 2016-2021 DeSmuME team + + This file is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This file is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with the this software. If not, see . + */ + +#ifndef COLORSPACEHANDLER_AVX512_H +#define COLORSPACEHANDLER_AVX512_H + +#include "colorspacehandler.h" + +#ifndef ENABLE_AVX512_1 + #warning This header requires AVX-512 Tier-1 support. 
+#else + +template void ColorspaceConvert555To8888_AVX512(const v512u16 &srcColor, const v512u16 &srcAlphaBits, v512u32 &dstLo, v512u32 &dstHi); +template void ColorspaceConvert555XTo888X_AVX512(const v512u16 &srcColor, v512u32 &dstLo, v512u32 &dstHi); +template void ColorspaceConvert555To6665_AVX512(const v512u16 &srcColor, const v512u16 &srcAlphaBits, v512u32 &dstLo, v512u32 &dstHi); +template void ColorspaceConvert555XTo666X_AVX512(const v512u16 &srcColor, v512u32 &dstLo, v512u32 &dstHi); +template void ColorspaceConvert555To8888Opaque_AVX512(const v512u16 &srcColor, v512u32 &dstLo, v512u32 &dstHi); +template void ColorspaceConvert555To6665Opaque_AVX512(const v512u16 &srcColor, v512u32 &dstLo, v512u32 &dstHi); +template v512u32 ColorspaceConvert8888To6665_AVX512(const v512u32 &src); +template v512u32 ColorspaceConvert6665To8888_AVX512(const v512u32 &src); +template v512u16 ColorspaceConvert8888To5551_AVX512(const v512u32 &srcLo, const v512u32 &srcHi); +template v512u16 ColorspaceConvert6665To5551_AVX512(const v512u32 &srcLo, const v512u32 &srcHi); +template v512u32 ColorspaceConvert888XTo8888Opaque_AVX512(const v512u32 &src); + +template v512u16 ColorspaceCopy16_AVX512(const v512u16 &src); +template v512u32 ColorspaceCopy32_AVX512(const v512u32 &src); + +template v512u16 ColorspaceApplyIntensity16_AVX512(const v512u16 &src, float intensity); +template v512u32 ColorspaceApplyIntensity32_AVX512(const v512u32 &src, float intensity); + +class ColorspaceHandler_AVX512 : public ColorspaceHandler +{ +public: + ColorspaceHandler_AVX512() {}; + + template size_t ConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + template size_t ConvertBuffer555To8888Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + template size_t ConvertBuffer555To8888Opaque_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + template size_t 
ConvertBuffer555To8888Opaque_SwapRB_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + + template size_t ConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + template size_t ConvertBuffer555To6665Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + template size_t ConvertBuffer555To6665Opaque_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + template size_t ConvertBuffer555To6665Opaque_SwapRB_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + + size_t ConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer8888To6665_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer8888To6665_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer8888To6665_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + + size_t ConvertBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer6665To8888_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer6665To8888_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer6665To8888_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + + size_t ConvertBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer8888To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer8888To5551_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer8888To5551_SwapRB_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + + size_t ConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer6665To5551_SwapRB(const u32 
*__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer6665To5551_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer6665To5551_SwapRB_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + + size_t ConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer888XTo8888Opaque_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer888XTo8888Opaque_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer888XTo8888Opaque_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + + size_t ConvertBuffer555XTo888(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555XTo888_SwapRB(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555XTo888_IsUnaligned(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555XTo888_SwapRB_IsUnaligned(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + + size_t ConvertBuffer888XTo888(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer888XTo888_SwapRB(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer888XTo888_IsUnaligned(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer888XTo888_SwapRB_IsUnaligned(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + + size_t CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const; + size_t CopyBuffer16_SwapRB_IsUnaligned(const u16 *src, u16 *dst, size_t pixCount) const; + + size_t CopyBuffer32_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; + size_t CopyBuffer32_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + + size_t ApplyIntensityToBuffer16(u16 *dst, size_t 
pixCount, float intensity) const; + size_t ApplyIntensityToBuffer16_SwapRB(u16 *dst, size_t pixCount, float intensity) const; + size_t ApplyIntensityToBuffer16_IsUnaligned(u16 *dst, size_t pixCount, float intensity) const; + size_t ApplyIntensityToBuffer16_SwapRB_IsUnaligned(u16 *dst, size_t pixCount, float intensity) const; + + size_t ApplyIntensityToBuffer32(u32 *dst, size_t pixCount, float intensity) const; + size_t ApplyIntensityToBuffer32_SwapRB(u32 *dst, size_t pixCount, float intensity) const; + size_t ApplyIntensityToBuffer32_IsUnaligned(u32 *dst, size_t pixCount, float intensity) const; + size_t ApplyIntensityToBuffer32_SwapRB_IsUnaligned(u32 *dst, size_t pixCount, float intensity) const; +}; + +#endif // ENABLE_AVX512_1 + +#endif // COLORSPACEHANDLER_AVX512_H diff --git a/FSCHMasterEditor/nds/colorspacehandler/colorspacehandler_AltiVec.cpp b/FSCHMasterEditor/nds/colorspacehandler/colorspacehandler_AltiVec.cpp new file mode 100755 index 0000000..e289499 --- /dev/null +++ b/FSCHMasterEditor/nds/colorspacehandler/colorspacehandler_AltiVec.cpp @@ -0,0 +1,658 @@ +/* + Copyright (C) 2016-2022 DeSmuME team + + This file is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This file is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with the this software. If not, see . + */ + +#include "colorspacehandler_Altivec.h" + +#ifndef ENABLE_ALTIVEC + #error This code requires PowerPC AltiVec support. 
+#else + +#include + +template +FORCEINLINE void ColorspaceConvert555To8888_AltiVec(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi) +{ + // Conversion algorithm: + // RGB 5-bit to 8-bit formula: dstRGB8 = (srcRGB5 << 3) | ((srcRGB5 >> 2) & 0x07) + + v128u16 srcSwapped; + if ( (BE_BYTESWAP == BESwapSrc) || (BE_BYTESWAP == BESwapSrcDst) ) + { + srcSwapped = vec_perm((v128u8)srcColor, (v128u8)srcColor, ((v128u8){1,0, 3,2, 5,4, 7,6, 9,8, 11,10, 13,12, 15,14})); + } + else + { + srcSwapped = srcColor; + } + + dstLo = vec_unpackl((vector pixel)srcSwapped); + dstLo = vec_or( vec_sl((v128u8)dstLo, ((v128u8){0,3,3,3, 0,3,3,3, 0,3,3,3, 0,3,3,3})), vec_sr((v128u8)dstLo, ((v128u8){0,2,2,2, 0,2,2,2, 0,2,2,2, 0,2,2,2})) ); + + if ( (BE_BYTESWAP == BESwapDst) || (BE_BYTESWAP == BESwapSrcDst) ) + { + dstLo = vec_perm((v128u8)dstLo, (v128u8)srcAlphaBits, (SWAP_RB) ? ((v128u8){0x01,0x02,0x03,0x11, 0x05,0x06,0x07,0x13, 0x09,0x0A,0x0B,0x15, 0x0D,0x0E,0x0F,0x17}) : ((v128u8){0x03,0x02,0x01,0x11, 0x07,0x06,0x05,0x13, 0x0B,0x0A,0x09,0x15, 0x0F,0x0E,0x0D,0x17})); + } + else + { + dstLo = vec_perm((v128u8)dstLo, (v128u8)srcAlphaBits, (SWAP_RB) ? ((v128u8){0x11,0x03,0x02,0x01, 0x13,0x07,0x06,0x05, 0x15,0x0B,0x0A,0x09, 0x17,0x0F,0x0E,0x0D}) : ((v128u8){0x11,0x01,0x02,0x03, 0x13,0x05,0x06,0x07, 0x15,0x09,0x0A,0x0B, 0x17,0x0D,0x0E,0x0F})); + } + + dstHi = vec_unpackh((vector pixel)srcSwapped); + dstHi = vec_or( vec_sl((v128u8)dstHi, ((v128u8){0,3,3,3, 0,3,3,3, 0,3,3,3, 0,3,3,3})), vec_sr((v128u8)dstHi, ((v128u8){0,2,2,2, 0,2,2,2, 0,2,2,2, 0,2,2,2})) ); + + if ( (BE_BYTESWAP == BESwapDst) || (BE_BYTESWAP == BESwapSrcDst) ) + { + dstHi = vec_perm((v128u8)dstHi, (v128u8)srcAlphaBits, (SWAP_RB) ? 
((v128u8){0x01,0x02,0x03,0x19, 0x05,0x06,0x07,0x1B, 0x09,0x0A,0x0B,0x1D, 0x0D,0x0E,0x0F,0x1F}) : ((v128u8){0x03,0x02,0x01,0x19, 0x07,0x06,0x05,0x1B, 0x0B,0x0A,0x09,0x1D, 0x0F,0x0E,0x0D,0x1F})); + } + else + { + dstHi = vec_perm((v128u8)dstHi, (v128u8)srcAlphaBits, (SWAP_RB) ? ((v128u8){0x19,0x03,0x02,0x01, 0x1B,0x07,0x06,0x05, 0x1D,0x0B,0x0A,0x09, 0x1F,0x0F,0x0E,0x0D}) : ((v128u8){0x19,0x01,0x02,0x03, 0x1B,0x05,0x06,0x07, 0x1D,0x09,0x0A,0x0B, 0x1F,0x0D,0x0E,0x0F})); + } +} + +template +FORCEINLINE void ColorspaceConvert555XTo888X_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi) +{ + const v128u16 srcAlphaBits16 = {0, 0, 0, 0, 0, 0, 0, 0}; + ColorspaceConvert555To8888_AltiVec(srcColor, srcAlphaBits16, dstLo, dstHi); +} + +template +FORCEINLINE void ColorspaceConvert555To6665_AltiVec(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi) +{ + // Conversion algorithm: + // RGB 5-bit to 6-bit formula: dstRGB6 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01) + + v128u16 srcSwapped; + if ( (BE_BYTESWAP == BESwapSrc) || (BE_BYTESWAP == BESwapSrcDst) ) + { + srcSwapped = vec_perm((v128u8)srcColor, (v128u8)srcColor, ((v128u8){1,0, 3,2, 5,4, 7,6, 9,8, 11,10, 13,12, 15,14})); + } + else + { + srcSwapped = srcColor; + } + + dstLo = vec_unpackl((vector pixel)srcSwapped); + dstLo = vec_or( vec_sl((v128u8)dstLo, ((v128u8){0,1,1,1, 0,1,1,1, 0,1,1,1, 0,1,1,1})), vec_sr((v128u8)dstLo, ((v128u8){0,4,4,4, 0,4,4,4, 0,4,4,4, 0,4,4,4})) ); + + if ( (BE_BYTESWAP == BESwapDst) || (BE_BYTESWAP == BESwapSrcDst) ) + { + dstLo = vec_perm((v128u8)dstLo, (v128u8)srcAlphaBits, (SWAP_RB) ? ((v128u8){0x01,0x02,0x03,0x11, 0x05,0x06,0x07,0x13, 0x09,0x0A,0x0B,0x15, 0x0D,0x0E,0x0F,0x17}) : ((v128u8){0x03,0x02,0x01,0x11, 0x07,0x06,0x05,0x13, 0x0B,0x0A,0x09,0x15, 0x0F,0x0E,0x0D,0x17})); + } + else + { + dstLo = vec_perm((v128u8)dstLo, (v128u8)srcAlphaBits, (SWAP_RB) ? 
((v128u8){0x11,0x03,0x02,0x01, 0x13,0x07,0x06,0x05, 0x15,0x0B,0x0A,0x09, 0x17,0x0F,0x0E,0x0D}) : ((v128u8){0x11,0x01,0x02,0x03, 0x13,0x05,0x06,0x07, 0x15,0x09,0x0A,0x0B, 0x17,0x0D,0x0E,0x0F})); + } + + dstHi = vec_unpackh((vector pixel)srcSwapped); + dstHi = vec_or( vec_sl((v128u8)dstHi, ((v128u8){0,1,1,1, 0,1,1,1, 0,1,1,1, 0,1,1,1})), vec_sr((v128u8)dstHi, ((v128u8){0,4,4,4, 0,4,4,4, 0,4,4,4, 0,4,4,4})) ); + + if ( (BE_BYTESWAP == BESwapDst) || (BE_BYTESWAP == BESwapSrcDst) ) + { + dstHi = vec_perm((v128u8)dstHi, (v128u8)srcAlphaBits, (SWAP_RB) ? ((v128u8){0x01,0x02,0x03,0x19, 0x05,0x06,0x07,0x1B, 0x09,0x0A,0x0B,0x1D, 0x0D,0x0E,0x0F,0x1F}) : ((v128u8){0x03,0x02,0x01,0x19, 0x07,0x06,0x05,0x1B, 0x0B,0x0A,0x09,0x1D, 0x0F,0x0E,0x0D,0x1F})); + } + else + { + dstHi = vec_perm((v128u8)dstHi, (v128u8)srcAlphaBits, (SWAP_RB) ? ((v128u8){0x19,0x03,0x02,0x01, 0x1B,0x07,0x06,0x05, 0x1D,0x0B,0x0A,0x09, 0x1F,0x0F,0x0E,0x0D}) : ((v128u8){0x19,0x01,0x02,0x03, 0x1B,0x05,0x06,0x07, 0x1D,0x09,0x0A,0x0B, 0x1F,0x0D,0x0E,0x0F})); + } +} + +template +FORCEINLINE void ColorspaceConvert555XTo666X_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi) +{ + const v128u16 srcAlphaBits16 = {0, 0, 0, 0, 0, 0, 0, 0}; + ColorspaceConvert555To6665_AltiVec(srcColor, srcAlphaBits16, dstLo, dstHi); +} + +template +FORCEINLINE void ColorspaceConvert555To8888Opaque_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi) +{ + const v128u16 srcAlphaBits16 = {0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF}; + ColorspaceConvert555To8888_AltiVec(srcColor, srcAlphaBits16, dstLo, dstHi); +} + +template +FORCEINLINE void ColorspaceConvert555To6665Opaque_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi) +{ + const v128u16 srcAlphaBits16 = {0x1F1F, 0x1F1F, 0x1F1F, 0x1F1F, 0x1F1F, 0x1F1F, 0x1F1F, 0x1F1F}; + ColorspaceConvert555To6665_AltiVec(srcColor, srcAlphaBits16, dstLo, dstHi); +} + +template +FORCEINLINE v128u32 ColorspaceConvert8888To6665_AltiVec(const 
v128u32 &src) +{ + // Conversion algorithm: + // RGB 8-bit to 6-bit formula: dstRGB6 = (srcRGB8 >> 2) + // Alpha 8-bit to 5-bit formula: dstA5 = (srcA8 >> 3) + // The per-byte shift counts below apply 2 to three bytes of each pixel and 3 to the fourth (alpha) byte. + v128u8 rgba = vec_sr( (v128u8)src, ((v128u8){2,2,2,3, 2,2,2,3, 2,2,2,3, 2,2,2,3}) ); + + if (SWAP_RB) + { + rgba = vec_perm( rgba, rgba, ((v128u8){2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15}) ); + } + + return (v128u32)rgba; +} + +template +FORCEINLINE v128u32 ColorspaceConvert6665To8888_AltiVec(const v128u32 &src) +{ + // Conversion algorithm: + // RGB 6-bit to 8-bit formula: dstRGB8 = (srcRGB6 << 2) | ((srcRGB6 >> 4) & 0x03) + // Alpha 5-bit to 8-bit formula: dstA8 = (srcA5 << 3) | ((srcA5 >> 2) & 0x07) + v128u8 rgba = vec_or( vec_sl((v128u8)src, ((v128u8){2,2,2,3, 2,2,2,3, 2,2,2,3, 2,2,2,3})), vec_sr((v128u8)src, ((v128u8){4,4,4,2, 4,4,4,2, 4,4,4,2, 4,4,4,2})) ); + + if (SWAP_RB) + { + rgba = vec_perm( rgba, rgba, ((v128u8){2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15}) ); + } + + return (v128u32)rgba; +} + +template +FORCEINLINE v128u16 _ConvertColorBaseTo5551_AltiVec(const v128u32 &srcLo, const v128u32 &srcHi) +{ + // Packs two vectors of 32-bit pixels (srcLo, srcHi) into one vector of 16-bit pixels. + // A source that is already BGR555_Rev is returned unchanged from srcLo. + if (COLORFORMAT == NDSColorFormat_BGR555_Rev) + { + return srcLo; + } + + v128u32 rgbLo; + v128u32 rgbHi; + + v128u16 dstColor; + v128u16 dstAlpha; + + if (COLORFORMAT == NDSColorFormat_BGR666_Rev) + { + rgbLo = vec_sl( srcLo, ((v128u32){2,2,2,2}) ); + rgbHi = vec_sl( srcHi, ((v128u32){2,2,2,2}) ); + + // Convert alpha + dstAlpha = vec_packsu( vec_and(vec_sr(srcLo, ((v128u32){24,24,24,24})), ((v128u32){0x0000001F,0x0000001F,0x0000001F,0x0000001F})), vec_and(vec_sr(srcHi, ((v128u32){24,24,24,24})), ((v128u32){0x0000001F,0x0000001F,0x0000001F,0x0000001F})) ); + } + else if (COLORFORMAT == NDSColorFormat_BGR888_Rev) + { + rgbLo = srcLo; + rgbHi = srcHi; + + // Convert alpha + dstAlpha = vec_packsu( vec_sr(srcLo, ((v128u32){24,24,24,24})), vec_sr(srcHi, ((v128u32){24,24,24,24})) ); + } + + // NOTE(review): rgbLo, rgbHi and dstAlpha are assigned only in the BGR666_Rev and BGR888_Rev branches above; any other COLORFORMAT reaching this point would read them uninitialized. + // Any non-zero source alpha becomes the single alpha bit 0x8000. + dstAlpha = vec_cmpgt(dstAlpha, ((v128u16){0,0,0,0,0,0,0,0})); + dstAlpha = vec_and(dstAlpha, 
((v128u16){0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000})); + + // Convert RGB + if (SWAP_RB) + { + rgbLo = vec_perm( (v128u8)rgbLo, (v128u8)rgbLo, ((v128u8){3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14}) ); + rgbHi = vec_perm( (v128u8)rgbHi, (v128u8)rgbHi, ((v128u8){3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14}) ); + } + else + { + rgbLo = vec_perm( (v128u8)rgbLo, (v128u8)rgbLo, ((v128u8){3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12}) ); + rgbHi = vec_perm( (v128u8)rgbHi, (v128u8)rgbHi, ((v128u8){3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12}) ); + } + + dstColor = (v128u16)vec_packpx(rgbLo, rgbHi); + dstColor = vec_and(dstColor, ((v128u16){0x7FFF,0x7FFF,0x7FFF,0x7FFF,0x7FFF,0x7FFF,0x7FFF,0x7FFF})); + + return vec_or(dstColor, dstAlpha); +} + +template +FORCEINLINE v128u16 ColorspaceConvert8888To5551_AltiVec(const v128u32 &srcLo, const v128u32 &srcHi) +{ + return _ConvertColorBaseTo5551_AltiVec(srcLo, srcHi); +} + +template +FORCEINLINE v128u16 ColorspaceConvert6665To5551_AltiVec(const v128u32 &srcLo, const v128u32 &srcHi) +{ + return _ConvertColorBaseTo5551_AltiVec(srcLo, srcHi); +} + +template +FORCEINLINE v128u32 ColorspaceConvert888XTo8888Opaque_AltiVec(const v128u32 &src) +{ + if (SWAP_RB) + { + return vec_or( vec_perm((v128u8)src, (v128u8)src, ((v128u8){3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14})), ((v128u32){0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000}) ); + } + + return vec_or( vec_perm((v128u8)src, (v128u8)src, ((v128u8){3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12})), ((v128u32){0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000}) ); +} + +template +FORCEINLINE v128u16 ColorspaceCopy16_AltiVec(const v128u16 &src) +{ + // Copies eight 16-bit pixels; when SWAP_RB is set, exchanges the two outer 5-bit color fields of each pixel. + if (SWAP_RB) + { + // Field 0x7C00 moves down by 10 bits, field 0x001F moves up by 10 bits, and the middle + // 5-bit field (0x03E0) and the top bit (0x8000) are passed through in place. + return vec_or( vec_or(vec_sr(vec_and(src, ((v128u16){0x7C00,0x7C00,0x7C00,0x7C00,0x7C00,0x7C00,0x7C00,0x7C00})), ((v128u16){10,10,10,10,10,10,10,10})), vec_or(vec_and(src, ((v128u16){0x03E0,0x03E0,0x03E0,0x03E0,0x03E0,0x03E0,0x03E0,0x03E0})), vec_sl(vec_and(src, ((v128u16){0x001F,0x001F,0x001F,0x001F,0x001F,0x001F,0x001F,0x001F})), 
((v128u16){10,10,10,10,10,10,10,10})))), vec_and(src, ((v128u16){0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000})) ); + } + + return src; +} + +template +FORCEINLINE v128u32 ColorspaceCopy32_AltiVec(const v128u32 &src) +{ + if (SWAP_RB) + { + return vec_perm((v128u8)src, (v128u8)src, ((v128u8){2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15})); + } + + return src; +} + +template +static size_t ColorspaceConvertBuffer555To8888Opaque_AltiVec(const u16 *__restrict src, u32 *__restrict dst, const size_t pixCountVec128) +{ + size_t i = 0; + + for (; i < pixCountVec128; i+=sizeof(v128u16)/sizeof(u16)) + { + v128u32 dstConvertedLo, dstConvertedHi; + + ColorspaceConvert555To8888Opaque_AltiVec( vec_ld(0, src+i), dstConvertedLo, dstConvertedHi ); + vec_st(dstConvertedHi, 0, dst+i); + vec_st(dstConvertedLo, 16, dst+i); + } + + return i; +} + +template +size_t ColorspaceConvertBuffer555To6665Opaque_AltiVec(const u16 *__restrict src, u32 *__restrict dst, size_t pixCountVec128) +{ + size_t i = 0; + + for (; i < pixCountVec128; i+=sizeof(v128u16)/sizeof(u16)) + { + v128u32 dstConvertedLo, dstConvertedHi; + + ColorspaceConvert555To6665Opaque_AltiVec( vec_ld(0, src+i), dstConvertedLo, dstConvertedHi ); + vec_st(dstConvertedHi, 0, dst+i); + vec_st(dstConvertedLo, 16, dst+i); + } + + return i; +} + +template +size_t ColorspaceConvertBuffer8888To6665_AltiVec(const u32 *src, u32 *dst, size_t pixCountVec128) +{ + size_t i = 0; + + for (; i < pixCountVec128; i+=4) + { + vec_st( ColorspaceConvert8888To6665_AltiVec(vec_ld(0, src+i)), 0, dst+i ); + } + + return i; +} + +template +size_t ColorspaceConvertBuffer6665To8888_AltiVec(const u32 *src, u32 *dst, size_t pixCountVec128) +{ + size_t i = 0; + + for (; i < pixCountVec128; i+=4) + { + vec_st( ColorspaceConvert6665To8888_AltiVec(vec_ld(0, src+i)), 0, dst+i ); + } + + return i; +} + +template +size_t ColorspaceConvertBuffer8888To5551_AltiVec(const u32 *__restrict src, u16 *__restrict dst, size_t pixCountVec128) +{ + size_t i = 0; + + for 
(; i < pixCountVec128; i+=8) + { + vec_st( ColorspaceConvert8888To5551_AltiVec(vec_ld(0, src+i), vec_ld(16, src+i)), 0, dst+i ); + } + + return i; +} + +template +size_t ColorspaceConvertBuffer6665To5551_AltiVec(const u32 *__restrict src, u16 *__restrict dst, size_t pixCountVec128) +{ + size_t i = 0; + + for (; i < pixCountVec128; i+=8) + { + vec_st( ColorspaceConvert6665To5551_AltiVec(vec_ld(0, src+i), vec_ld(16, src+i)), 0, dst+i ); + } + + return i; +} + +template +size_t ColorspaceConvertBuffer888XTo8888Opaque_AltiVec(const u32 *src, u32 *dst, size_t pixCountVec128) +{ + size_t i = 0; + + for (; i < pixCountVec128; i+=4) + { + vec_st( ColorspaceConvert888XTo8888Opaque_AltiVec(vec_ld(0, src+i)), 0, dst+i ); + } + + return i; +} + +template +size_t ColorspaceConvertBuffer555XTo888_AltiVec(const u16 *src, u8 *dst, size_t pixCountVec128) +{ + size_t i = 0; + v128u16 src_v128u16[2]; + v128u32 src_v128u32[4]; + + for (; i < pixCountVec128; i+=16) + { + src_v128u16[0] = vec_ld( 0, src+i); + src_v128u16[1] = vec_ld(16, src+i); + + src_v128u32[0] = vec_unpackl((vector pixel)src_v128u16[0]); + src_v128u32[1] = vec_unpackh((vector pixel)src_v128u16[0]); + src_v128u32[2] = vec_unpackl((vector pixel)src_v128u16[1]); + src_v128u32[3] = vec_unpackh((vector pixel)src_v128u16[1]); + + src_v128u32[0] = vec_or( vec_sl((v128u8)src_v128u32[0], ((v128u8){0,3,3,3, 0,3,3,3, 0,3,3,3, 0,3,3,3})), vec_sr((v128u8)src_v128u32[0], ((v128u8){0,2,2,2, 0,2,2,2, 0,2,2,2, 0,2,2,2})) ); + src_v128u32[1] = vec_or( vec_sl((v128u8)src_v128u32[1], ((v128u8){0,3,3,3, 0,3,3,3, 0,3,3,3, 0,3,3,3})), vec_sr((v128u8)src_v128u32[1], ((v128u8){0,2,2,2, 0,2,2,2, 0,2,2,2, 0,2,2,2})) ); + src_v128u32[2] = vec_or( vec_sl((v128u8)src_v128u32[2], ((v128u8){0,3,3,3, 0,3,3,3, 0,3,3,3, 0,3,3,3})), vec_sr((v128u8)src_v128u32[2], ((v128u8){0,2,2,2, 0,2,2,2, 0,2,2,2, 0,2,2,2})) ); + src_v128u32[3] = vec_or( vec_sl((v128u8)src_v128u32[3], ((v128u8){0,3,3,3, 0,3,3,3, 0,3,3,3, 0,3,3,3})), vec_sr((v128u8)src_v128u32[3], 
((v128u8){0,2,2,2, 0,2,2,2, 0,2,2,2, 0,2,2,2})) ); + + if (SWAP_RB) + { + src_v128u32[0] = vec_perm( (v128u8)src_v128u32[0], (v128u8)src_v128u32[1], ((v128u8){0x05,0x03,0x02,0x01, 0x0A,0x09,0x07,0x06, 0x0F,0x0E,0x0D,0x0B, 0x15,0x13,0x12,0x11}) ); + src_v128u32[1] = vec_perm( (v128u8)src_v128u32[1], (v128u8)src_v128u32[2], ((v128u8){0x0A,0x09,0x07,0x06, 0x0F,0x0E,0x0D,0x0B, 0x15,0x13,0x12,0x11, 0x1A,0x19,0x17,0x16}) ); + src_v128u32[2] = vec_perm( (v128u8)src_v128u32[2], (v128u8)src_v128u32[3], ((v128u8){0x0F,0x0E,0x0D,0x0B, 0x15,0x13,0x12,0x11, 0x1A,0x19,0x17,0x16, 0x1F,0x1E,0x1D,0x1B}) ); + } + else + { + src_v128u32[0] = vec_perm( (v128u8)src_v128u32[0], (v128u8)src_v128u32[1], ((v128u8){0x07,0x01,0x02,0x03, 0x0A,0x0B,0x05,0x06, 0x0D,0x0E,0x0F,0x09, 0x17,0x11,0x12,0x13}) ); + src_v128u32[1] = vec_perm( (v128u8)src_v128u32[1], (v128u8)src_v128u32[2], ((v128u8){0x0A,0x0B,0x05,0x06, 0x0D,0x0E,0x0F,0x09, 0x17,0x11,0x12,0x13, 0x1A,0x1B,0x15,0x16}) ); + src_v128u32[2] = vec_perm( (v128u8)src_v128u32[2], (v128u8)src_v128u32[3], ((v128u8){0x0D,0x0E,0x0F,0x09, 0x17,0x11,0x12,0x13, 0x1A,0x1B,0x15,0x16, 0x1D,0x1E,0x1F,0x19}) ); + } + + vec_st( src_v128u32[0], 0, dst + (i * 3) ); + vec_st( src_v128u32[1], 16, dst + (i * 3) ); + vec_st( src_v128u32[2], 32, dst + (i * 3) ); + } + + return i; +} + +template +size_t ColorspaceConvertBuffer888XTo888_AltiVec(const u32 *src, u8 *dst, size_t pixCountVec128) +{ + size_t i = 0; + v128u32 src_v128u32[4]; + + for (; i < pixCountVec128; i+=16) + { + src_v128u32[0] = vec_ld( 0, src+i); + src_v128u32[1] = vec_ld(16, src+i); + src_v128u32[2] = vec_ld(32, src+i); + src_v128u32[3] = vec_ld(48, src+i); + + if (SWAP_RB) + { + src_v128u32[0] = vec_perm( (v128u8)src_v128u32[0], (v128u8)src_v128u32[1], ((v128u8){0x05,0x03,0x02,0x01, 0x0A,0x09,0x07,0x06, 0x0F,0x0E,0x0D,0x0B, 0x15,0x13,0x12,0x11}) ); + src_v128u32[1] = vec_perm( (v128u8)src_v128u32[1], (v128u8)src_v128u32[2], ((v128u8){0x0A,0x09,0x07,0x06, 0x0F,0x0E,0x0D,0x0B, 0x15,0x13,0x12,0x11, 
0x1A,0x19,0x17,0x16}) ); + src_v128u32[2] = vec_perm( (v128u8)src_v128u32[2], (v128u8)src_v128u32[3], ((v128u8){0x0F,0x0E,0x0D,0x0B, 0x15,0x13,0x12,0x11, 0x1A,0x19,0x17,0x16, 0x1F,0x1E,0x1D,0x1B}) ); + } + else + { + src_v128u32[0] = vec_perm( (v128u8)src_v128u32[0], (v128u8)src_v128u32[1], ((v128u8){0x07,0x01,0x02,0x03, 0x0A,0x0B,0x05,0x06, 0x0D,0x0E,0x0F,0x09, 0x17,0x11,0x12,0x13}) ); + src_v128u32[1] = vec_perm( (v128u8)src_v128u32[1], (v128u8)src_v128u32[2], ((v128u8){0x0A,0x0B,0x05,0x06, 0x0D,0x0E,0x0F,0x09, 0x17,0x11,0x12,0x13, 0x1A,0x1B,0x15,0x16}) ); + src_v128u32[2] = vec_perm( (v128u8)src_v128u32[2], (v128u8)src_v128u32[3], ((v128u8){0x0D,0x0E,0x0F,0x09, 0x17,0x11,0x12,0x13, 0x1A,0x1B,0x15,0x16, 0x1D,0x1E,0x1F,0x19}) ); + } + + vec_st( src_v128u32[0], 0, dst + (i * 3) ); + vec_st( src_v128u32[1], 16, dst + (i * 3) ); + vec_st( src_v128u32[2], 32, dst + (i * 3) ); + } + + return i; +} + +template +size_t ColorspaceCopyBuffer16_AltiVec(const u16 *src, u16 *dst, size_t pixCountVec128) +{ + if (!SWAP_RB) + { + memcpy(dst, src, pixCountVec128 * sizeof(u16)); + return pixCountVec128; + } + + size_t i = 0; + + for (; i < pixCountVec128; i+=8) + { + vec_st( ColorspaceCopy16_AltiVec(vec_ld(0, src+i)), 0, dst+i ); + } + + return i; +} + +template +size_t ColorspaceCopyBuffer32_AltiVec(const u32 *src, u32 *dst, size_t pixCountVec128) +{ + if (!SWAP_RB) + { + memcpy(dst, src, pixCountVec128 * sizeof(u32)); + return pixCountVec128; + } + + size_t i = 0; + + for (; i < pixCountVec128; i+=4) + { + vec_st( ColorspaceCopy32_AltiVec(vec_ld(0, src+i)), 0, dst+i ); + } + + return i; +} + +template +size_t ColorspaceHandler_AltiVec::ConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To8888Opaque_AltiVec(src, dst, pixCount); +} + +template +size_t ColorspaceHandler_AltiVec::ConvertBuffer555To8888Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return 
ColorspaceConvertBuffer555To8888Opaque_AltiVec(src, dst, pixCount); +} + +template +size_t ColorspaceHandler_AltiVec::ConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To6665Opaque_AltiVec(src, dst, pixCount); +} + +template +size_t ColorspaceHandler_AltiVec::ConvertBuffer555To6665Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To6665Opaque_AltiVec(src, dst, pixCount); +} + +size_t ColorspaceHandler_AltiVec::ConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To6665_AltiVec(src, dst, pixCount); +} + +size_t ColorspaceHandler_AltiVec::ConvertBuffer8888To6665_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To6665_AltiVec(src, dst, pixCount); +} + +size_t ColorspaceHandler_AltiVec::ConvertBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To8888_AltiVec(src, dst, pixCount); +} + +size_t ColorspaceHandler_AltiVec::ConvertBuffer6665To8888_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To8888_AltiVec(src, dst, pixCount); +} + +size_t ColorspaceHandler_AltiVec::ConvertBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To5551_AltiVec(src, dst, pixCount); +} + +size_t ColorspaceHandler_AltiVec::ConvertBuffer8888To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To5551_AltiVec(src, dst, pixCount); +} + +size_t ColorspaceHandler_AltiVec::ConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To5551_AltiVec(src, dst, pixCount); +} + +size_t 
ColorspaceHandler_AltiVec::ConvertBuffer6665To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To5551_AltiVec(src, dst, pixCount); +} + +size_t ColorspaceHandler_AltiVec::ConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo8888Opaque_AltiVec(src, dst, pixCount); +} + +size_t ColorspaceHandler_AltiVec::ConvertBuffer888XTo8888Opaque_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo8888Opaque_AltiVec(src, dst, pixCount); +} + +size_t ColorspaceHandler_AltiVec::ConvertBuffer555XTo888(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555XTo888_AltiVec(src, dst, pixCount); +} + +size_t ColorspaceHandler_AltiVec::ConvertBuffer555XTo888_SwapRB(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555XTo888_AltiVec(src, dst, pixCount); +} + +size_t ColorspaceHandler_AltiVec::ConvertBuffer888XTo888(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo888_AltiVec(src, dst, pixCount); +} + +size_t ColorspaceHandler_AltiVec::ConvertBuffer888XTo888_SwapRB(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo888_AltiVec(src, dst, pixCount); +} + +size_t ColorspaceHandler_AltiVec::CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const +{ + return ColorspaceCopyBuffer16_AltiVec(src, dst, pixCount); +} + +size_t ColorspaceHandler_AltiVec::CopyBuffer32_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceCopyBuffer32_AltiVec(src, dst, pixCount); +} + +template void ColorspaceConvert555To8888_AltiVec(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To8888_AltiVec(const 
v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To8888_AltiVec(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To8888_AltiVec(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To8888_AltiVec(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To8888_AltiVec(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To8888_AltiVec(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To8888_AltiVec(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi); + +template void ColorspaceConvert555XTo888X_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555XTo888X_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555XTo888X_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555XTo888X_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555XTo888X_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555XTo888X_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555XTo888X_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555XTo888X_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); + +template void ColorspaceConvert555To6665_AltiVec(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To6665_AltiVec(const v128u16 &srcColor, const 
v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To6665_AltiVec(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To6665_AltiVec(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To6665_AltiVec(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To6665_AltiVec(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To6665_AltiVec(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To6665_AltiVec(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi); + +template void ColorspaceConvert555XTo666X_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555XTo666X_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555XTo666X_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555XTo666X_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555XTo666X_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555XTo666X_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555XTo666X_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555XTo666X_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); + +template void ColorspaceConvert555To8888Opaque_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To8888Opaque_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void 
ColorspaceConvert555To8888Opaque_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To8888Opaque_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To8888Opaque_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To8888Opaque_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To8888Opaque_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To8888Opaque_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); + +template void ColorspaceConvert555To6665Opaque_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To6665Opaque_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To6665Opaque_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To6665Opaque_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To6665Opaque_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To6665Opaque_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To6665Opaque_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To6665Opaque_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); + +template v128u32 ColorspaceConvert8888To6665_AltiVec(const v128u32 &src); +template v128u32 ColorspaceConvert8888To6665_AltiVec(const v128u32 &src); + +template v128u32 ColorspaceConvert6665To8888_AltiVec(const v128u32 &src); +template v128u32 ColorspaceConvert6665To8888_AltiVec(const v128u32 &src); + +template v128u16 ColorspaceConvert8888To5551_AltiVec(const v128u32 &srcLo, const v128u32 &srcHi); +template 
v128u16 ColorspaceConvert8888To5551_AltiVec(const v128u32 &srcLo, const v128u32 &srcHi); + +template v128u16 ColorspaceConvert6665To5551_AltiVec(const v128u32 &srcLo, const v128u32 &srcHi); +template v128u16 ColorspaceConvert6665To5551_AltiVec(const v128u32 &srcLo, const v128u32 &srcHi); + +template v128u32 ColorspaceConvert888XTo8888Opaque_AltiVec(const v128u32 &src); +template v128u32 ColorspaceConvert888XTo8888Opaque_AltiVec(const v128u32 &src); + +template v128u16 ColorspaceCopy16_AltiVec(const v128u16 &src); +template v128u16 ColorspaceCopy16_AltiVec(const v128u16 &src); + +template v128u32 ColorspaceCopy32_AltiVec(const v128u32 &src); +template v128u32 ColorspaceCopy32_AltiVec(const v128u32 &src); + +#endif // ENABLE_ALTIVEC diff --git a/FSCHMasterEditor/nds/colorspacehandler/colorspacehandler_AltiVec.h b/FSCHMasterEditor/nds/colorspacehandler/colorspacehandler_AltiVec.h new file mode 100644 index 0000000..3078a13 --- /dev/null +++ b/FSCHMasterEditor/nds/colorspacehandler/colorspacehandler_AltiVec.h @@ -0,0 +1,83 @@ +/* + Copyright (C) 2016-2021 DeSmuME team + + This file is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This file is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with the this software. If not, see . + */ + +#ifndef COLORSPACEHANDLER_ALTIVEC_H +#define COLORSPACEHANDLER_ALTIVEC_H + +#include "colorspacehandler.h" + +#ifndef ENABLE_ALTIVEC + #warning This header requires PowerPC AltiVec support. 
+#else + +template void ColorspaceConvert555To8888_AltiVec(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555XTo888X_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To6665_AltiVec(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555XTo666X_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To8888Opaque_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To6665Opaque_AltiVec(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template v128u32 ColorspaceConvert8888To6665_AltiVec(const v128u32 &src); +template v128u32 ColorspaceConvert6665To8888_AltiVec(const v128u32 &src); +template v128u16 ColorspaceConvert8888To5551_AltiVec(const v128u32 &srcLo, const v128u32 &srcHi); +template v128u16 ColorspaceConvert6665To5551_AltiVec(const v128u32 &srcLo, const v128u32 &srcHi); +template v128u32 ColorspaceConvert888XTo8888Opaque_AltiVec(const v128u32 &src); + +template v128u16 ColorspaceCopy16_AltiVec(const v128u16 &src); +template v128u32 ColorspaceCopy32_AltiVec(const v128u32 &src); + +// AltiVec has very poor support for dealing with unaligned addresses (it's possible, just +// very obtuse), so we're not even going to bother dealing with any unaligned addresses. 
+class ColorspaceHandler_AltiVec : public ColorspaceHandler +{ +public: + ColorspaceHandler_AltiVec() {}; + + template size_t ConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + template size_t ConvertBuffer555To8888Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + + template size_t ConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + template size_t ConvertBuffer555To6665Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + + size_t ConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer8888To6665_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; + + size_t ConvertBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer6665To8888_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; + + size_t ConvertBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer8888To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + + size_t ConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer6665To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + + size_t ConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer888XTo8888Opaque_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; + + size_t ConvertBuffer555XTo888(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555XTo888_SwapRB(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + + size_t ConvertBuffer888XTo888(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer888XTo888_SwapRB(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) 
const; + + size_t CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const; + + size_t CopyBuffer32_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; +}; + +#endif // ENABLE_ALTIVEC + +#endif /* COLORSPACEHANDLER_ALTIVEC_H */ diff --git a/FSCHMasterEditor/nds/colorspacehandler/colorspacehandler_NEON.cpp b/FSCHMasterEditor/nds/colorspacehandler/colorspacehandler_NEON.cpp new file mode 100644 index 0000000..81ad657 --- /dev/null +++ b/FSCHMasterEditor/nds/colorspacehandler/colorspacehandler_NEON.cpp @@ -0,0 +1,1018 @@ +/* + Copyright (C) 2016-2022 DeSmuME team + + This file is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This file is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with the this software. If not, see . + */ + +#include "colorspacehandler_NEON.h" + +#ifndef ENABLE_NEON_A64 + #error This code requires ARM64 NEON support. 
+#else + +#include + +#define COLOR16_SWAPRB_NEON(src) vorrq_u16( vshlq_n_u16(vandq_u16(src,vdupq_n_u16(0x001F)),10), vorrq_u16( vandq_u16(src,vdupq_n_u16(0x03E0)), vorrq_u16(vshrq_n_u16(vandq_u16(src,vdupq_n_u16(0x7C00)),10), vandq_u16(src,vdupq_n_u16(0x8000))) ) ) + +#define COLOR32_SWAPRB_NEON(src) vreinterpretq_u32_u8( vqtbl1q_u8(vreinterpretq_u8_u32(src), ((v128u8){2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15})) ) + +template +FORCEINLINE void ColorspaceConvert555To8888_NEON(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi) +{ + // Conversion algorithm: + // RGB 5-bit to 8-bit formula: dstRGB8 = (srcRGB5 << 3) | ((srcRGB5 >> 2) & 0x07) + + if (SWAP_RB) + { + v128u16 rb = vorrq_u16( vshlq_n_u16(srcColor,11), vandq_u16(vshrq_n_u16(srcColor, 7), vdupq_n_u16(0x00F8)) ); + rb = vorrq_u16(rb, vandq_u16(vshrq_n_u16(rb, 5), vdupq_n_u16(0x0707))); + + v128u16 ga = vandq_u16(vshrq_n_u16(srcColor, 2), vdupq_n_u16(0x00F8) ); + ga = vorrq_u16(ga, vshrq_n_u16(ga, 5)); + ga = vorrq_u16(ga, srcAlphaBits); + + dstLo = vreinterpretq_u32_u8( vzip1q_u8(vreinterpretq_u8_u16(rb), vreinterpretq_u8_u16(ga)) ); + dstHi = vreinterpretq_u32_u8( vzip2q_u8(vreinterpretq_u8_u16(rb), vreinterpretq_u8_u16(ga)) ); + } + else + { + v128u16 rg = vorrq_u16( vandq_u16( vshlq_n_u16(srcColor,3), vdupq_n_u16(0x00F8) ), vandq_u16( vshlq_n_u16(srcColor,6), vdupq_n_u16(0xF800) ) ); + v128u16 ba = vandq_u16( vshrq_n_u16(srcColor,7), vdupq_n_u16(0x00F8) ); + + rg = vorrq_u16( rg, vreinterpretq_u16_u8(vshrq_n_u8(vreinterpretq_u8_u16(rg), 5)) ); + ba = vorrq_u16( ba, vreinterpretq_u16_u8(vshrq_n_u8(vreinterpretq_u8_u16(ba), 5)) ); + ba = vorrq_u16( ba, srcAlphaBits ); + + dstLo = vreinterpretq_u32_u16( vzip1q_u16(rg, ba) ); + dstHi = vreinterpretq_u32_u16( vzip2q_u16(rg, ba) ); + } +} + +template +FORCEINLINE void ColorspaceConvert555XTo888X_NEON(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi) +{ + // Conversion algorithm: + // RGB 5-bit to 8-bit formula: dstRGB8 = 
(srcRGB5 << 3) | ((srcRGB5 >> 2) & 0x07) + + if (SWAP_RB) + { + v128u16 rb = vorrq_u16( vshlq_n_u16(srcColor,11), vandq_u16(vshrq_n_u16(srcColor, 7), vdupq_n_u16(0x00F8)) ); + rb = vorrq_u16(rb, vandq_u16(vshrq_n_u16(rb, 5), vdupq_n_u16(0x0707))); + + v128u16 g = vandq_u16(vshrq_n_u16(srcColor, 2), vdupq_n_u16(0x00F8) ); + g = vorrq_u16(g, vshrq_n_u16(g, 5)); + + dstLo = vreinterpretq_u32_u8( vzip1q_u8(vreinterpretq_u8_u16(rb), vreinterpretq_u8_u16(g)) ); + dstHi = vreinterpretq_u32_u8( vzip2q_u8(vreinterpretq_u8_u16(rb), vreinterpretq_u8_u16(g)) ); + } + else + { + v128u16 rg = vorrq_u16( vandq_u16( vshlq_n_u16(srcColor,3), vdupq_n_u16(0x00F8) ), vandq_u16( vshlq_n_u16(srcColor,6), vdupq_n_u16(0xF800) ) ); + v128u16 b = vandq_u16( vshrq_n_u16(srcColor,7), vdupq_n_u16(0x00F8) ); + + rg = vorrq_u16( rg, vreinterpretq_u16_u8(vshrq_n_u8(vreinterpretq_u8_u16(rg), 5)) ); + b = vorrq_u16( b, vreinterpretq_u16_u8(vshrq_n_u8(vreinterpretq_u8_u16( b), 5)) ); + + dstLo = vreinterpretq_u32_u16( vzip1q_u16(rg, b) ); + dstHi = vreinterpretq_u32_u16( vzip2q_u16(rg, b) ); + } +} + +template +FORCEINLINE void ColorspaceConvert555To6665_NEON(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi) +{ + // Conversion algorithm: + // RGB 5-bit to 6-bit formula: dstRGB6 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01) + + if (SWAP_RB) + { + v128u16 rb = vandq_u16( vorrq_u16( vshlq_n_u16(srcColor,9), vshrq_n_u16(srcColor, 9)), vdupq_n_u16(0x3E3E) ); + rb = vorrq_u16(rb, vandq_u16(vshrq_n_u16(rb, 5), vdupq_n_u16(0x0101))); + + v128u16 ga = vandq_u16(vshrq_n_u16(srcColor, 4), vdupq_n_u16(0x003E) ); + ga = vorrq_u16(ga, vshrq_n_u16(ga, 5)); + ga = vorrq_u16(ga, srcAlphaBits); + + dstLo = vreinterpretq_u32_u8( vzip1q_u8(vreinterpretq_u8_u16(rb), vreinterpretq_u8_u16(ga)) ); + dstHi = vreinterpretq_u32_u8( vzip2q_u8(vreinterpretq_u8_u16(rb), vreinterpretq_u8_u16(ga)) ); + } + else + { + v128u16 rg = vorrq_u16( vandq_u16( vshlq_n_u16(srcColor,1), vdupq_n_u16(0x003E) ), 
vandq_u16( vshlq_n_u16(srcColor,4), vdupq_n_u16(0x3E00) ) ); + v128u16 ba = vandq_u16( vshrq_n_u16(srcColor,9), vdupq_n_u16(0x003E) ); + + rg = vorrq_u16( rg, vreinterpretq_u16_u8(vshrq_n_u8(vreinterpretq_u8_u16(rg), 5)) ); + ba = vorrq_u16( ba, vreinterpretq_u16_u8(vshrq_n_u8(vreinterpretq_u8_u16(ba), 5)) ); + ba = vorrq_u16( ba, srcAlphaBits ); + + dstLo = vreinterpretq_u32_u16( vzip1q_u16(rg, ba) ); + dstHi = vreinterpretq_u32_u16( vzip2q_u16(rg, ba) ); + } +} + +template +FORCEINLINE void ColorspaceConvert555XTo666X_NEON(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi) +{ + // Conversion algorithm: + // RGB 5-bit to 6-bit formula: dstRGB6 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01) + + if (SWAP_RB) + { + v128u16 rb = vandq_u16( vorrq_u16( vshlq_n_u16(srcColor,9), vshrq_n_u16(srcColor, 9)), vdupq_n_u16(0x3E3E) ); + rb = vorrq_u16(rb, vandq_u16(vshrq_n_u16(rb, 5), vdupq_n_u16(0x0101))); + + v128u16 g = vandq_u16(vshrq_n_u16(srcColor, 4), vdupq_n_u16(0x003E) ); + g = vorrq_u16(g, vshrq_n_u16(g, 5)); + + dstLo = vreinterpretq_u32_u8( vzip1q_u8(vreinterpretq_u8_u16(rb), vreinterpretq_u8_u16(g)) ); + dstHi = vreinterpretq_u32_u8( vzip2q_u8(vreinterpretq_u8_u16(rb), vreinterpretq_u8_u16(g)) ); + } + else + { + v128u16 rg = vorrq_u16( vandq_u16( vshlq_n_u16(srcColor,1), vdupq_n_u16(0x003E) ), vandq_u16( vshlq_n_u16(srcColor,4), vdupq_n_u16(0x3E00) ) ); + v128u16 b = vandq_u16( vshrq_n_u16(srcColor,9), vdupq_n_u16(0x003E) ); + + rg = vorrq_u16( rg, vreinterpretq_u16_u8(vshrq_n_u8(vreinterpretq_u8_u16(rg), 5)) ); + b = vorrq_u16( b, vreinterpretq_u16_u8(vshrq_n_u8(vreinterpretq_u8_u16( b), 5)) ); + + dstLo = vreinterpretq_u32_u16( vzip1q_u16(rg, b) ); + dstHi = vreinterpretq_u32_u16( vzip2q_u16(rg, b) ); + } +} + +template +FORCEINLINE void ColorspaceConvert555To8888Opaque_NEON(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi) +{ + const v128u16 srcAlphaBits16 = vdupq_n_u16(0xFF00); + ColorspaceConvert555To8888_NEON(srcColor, srcAlphaBits16, dstLo, dstHi); 
+} + +template +FORCEINLINE void ColorspaceConvert555To6665Opaque_NEON(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi) +{ + const v128u16 srcAlphaBits16 = vdupq_n_u16(0x1F00); + ColorspaceConvert555To6665_NEON(srcColor, srcAlphaBits16, dstLo, dstHi); +} + +template +FORCEINLINE v128u32 ColorspaceConvert8888To6665_NEON(const v128u32 &src) +{ + // Conversion algorithm: + // RGB 8-bit to 6-bit formula: dstRGB6 = (srcRGB8 >> 2) + // Alpha 8-bit to 6-bit formula: dstA5 = (srcA8 >> 3) + v128u32 rgba = vreinterpretq_u32_u8( vshlq_u8(vreinterpretq_u8_u32(src), ((v128s8){-2,-2,-2,-3, -2,-2,-2,-3, -2,-2,-2,-3, -2,-2,-2,-3})) ); + + if (SWAP_RB) + { + return COLOR32_SWAPRB_NEON(rgba); + } + + return rgba; +} + +template +FORCEINLINE v128u32 ColorspaceConvert6665To8888_NEON(const v128u32 &src) +{ + // Conversion algorithm: + // RGB 6-bit to 8-bit formula: dstRGB8 = (srcRGB6 << 2) | ((srcRGB6 >> 4) & 0x03) + // Alpha 5-bit to 8-bit formula: dstA8 = (srcA5 << 3) | ((srcA5 >> 2) & 0x07) + v128u32 rgba = vreinterpretq_u32_u8( vorrq_u8( vshlq_u8(vreinterpretq_u8_u32(src), ((v128s8){2,2,2,3, 2,2,2,3, 2,2,2,3, 2,2,2,3})), vshlq_u8(vreinterpretq_u8_u32(src), ((v128s8){-4,-4,-4,-2, -4,-4,-4,-2, -4,-4,-4,-2, -4,-4,-4,-2})) ) ); + + if (SWAP_RB) + { + return COLOR32_SWAPRB_NEON(rgba); + } + + return rgba; +} + +template +FORCEINLINE v128u16 _ConvertColorBaseTo5551_NEON(const v128u32 &srcLo, const v128u32 &srcHi) +{ + if (COLORFORMAT == NDSColorFormat_BGR555_Rev) + { + return vreinterpretq_u16_u32(srcLo); + } + + v128u32 rgbLo; + v128u32 rgbHi; + v128u16 alpha; + + if (COLORFORMAT == NDSColorFormat_BGR666_Rev) + { + if (SWAP_RB) + { + // Convert color from low bits + rgbLo = vandq_u32(vshrq_n_u32(srcLo, 17), vdupq_n_u32(0x0000001F)); + rgbLo = vorrq_u32(rgbLo, vandq_u32(vshrq_n_u32(srcLo, 4), vdupq_n_u32(0x000003E0)) ); + rgbLo = vorrq_u32(rgbLo, vandq_u32(vshlq_n_u32(srcLo, 9), vdupq_n_u32(0x00007C00)) ); + + // Convert color from high bits + rgbHi = vandq_u32(vshrq_n_u32(srcHi, 
17), vdupq_n_u32(0x0000001F)); + rgbHi = vorrq_u32(rgbHi, vandq_u32(vshrq_n_u32(srcHi, 4), vdupq_n_u32(0x000003E0)) ); + rgbHi = vorrq_u32(rgbHi, vandq_u32(vshlq_n_u32(srcHi, 9), vdupq_n_u32(0x00007C00)) ); + } + else + { + // Convert color from low bits + rgbLo = vandq_u32(vshrq_n_u32(srcLo, 1), vdupq_n_u32(0x0000001F)); + rgbLo = vorrq_u32(rgbLo, vandq_u32(vshrq_n_u32(srcLo, 4), vdupq_n_u32(0x000003E0)) ); + rgbLo = vorrq_u32(rgbLo, vandq_u32(vshrq_n_u32(srcLo, 7), vdupq_n_u32(0x00007C00)) ); + + // Convert color from high bits + rgbHi = vandq_u32(vshrq_n_u32(srcHi, 1), vdupq_n_u32(0x0000001F)); + rgbHi = vorrq_u32(rgbHi, vandq_u32(vshrq_n_u32(srcHi, 4), vdupq_n_u32(0x000003E0)) ); + rgbHi = vorrq_u32(rgbHi, vandq_u32(vshrq_n_u32(srcHi, 7), vdupq_n_u32(0x00007C00)) ); + } + + // Convert alpha + alpha = vuzp1q_u16( vreinterpretq_u16_u32(vandq_u32(vshrq_n_u32(srcLo, 24), vdupq_n_u32(0x0000001F))), vreinterpretq_u16_u32(vandq_u32(vshrq_n_u32(srcHi, 24), vdupq_n_u32(0x0000001F))) ); + alpha = vcgtq_u16(alpha, vdupq_n_u16(0)); + alpha = vandq_u16(alpha, vdupq_n_u16(0x8000)); + } + else if (COLORFORMAT == NDSColorFormat_BGR888_Rev) + { + if (SWAP_RB) + { + // Convert color from low bits + rgbLo = vandq_u32(vshrq_n_u32(srcLo, 19), vdupq_n_u32(0x0000001F)); + rgbLo = vorrq_u32(rgbLo, vandq_u32(vshrq_n_u32(srcLo, 6), vdupq_n_u32(0x000003E0)) ); + rgbLo = vorrq_u32(rgbLo, vandq_u32(vshlq_n_u32(srcLo, 7), vdupq_n_u32(0x00007C00)) ); + + // Convert color from high bits + rgbHi = vandq_u32(vshrq_n_u32(srcHi, 19), vdupq_n_u32(0x0000001F)); + rgbHi = vorrq_u32(rgbHi, vandq_u32(vshrq_n_u32(srcHi, 6), vdupq_n_u32(0x000003E0)) ); + rgbHi = vorrq_u32(rgbHi, vandq_u32(vshlq_n_u32(srcHi, 7), vdupq_n_u32(0x00007C00)) ); + } + else + { + // Convert color from low bits + rgbLo = vandq_u32(vshrq_n_u32(srcLo, 3), vdupq_n_u32(0x0000001F)); + rgbLo = vorrq_u32(rgbLo, vandq_u32(vshrq_n_u32(srcLo, 6), vdupq_n_u32(0x000003E0)) ); + rgbLo = vorrq_u32(rgbLo, vandq_u32(vshrq_n_u32(srcLo, 9), 
vdupq_n_u32(0x00007C00)) ); + + // Convert color from high bits + rgbHi = vandq_u32(vshrq_n_u32(srcHi, 3), vdupq_n_u32(0x0000001F)); + rgbHi = vorrq_u32(rgbHi, vandq_u32(vshrq_n_u32(srcHi, 6), vdupq_n_u32(0x000003E0)) ); + rgbHi = vorrq_u32(rgbHi, vandq_u32(vshrq_n_u32(srcHi, 9), vdupq_n_u32(0x00007C00)) ); + } + + // Convert alpha + alpha = vuzp1q_u16( vreinterpretq_u16_u32(vshrq_n_u32(srcLo, 24)), vreinterpretq_u16_u32(vshrq_n_u32(srcHi, 24)) ); + alpha = vcgtq_u16(alpha, vdupq_n_u16(0)); + alpha = vandq_u16(alpha, vdupq_n_u16(0x8000)); + } + + return vorrq_u16( vuzp1q_u16(vreinterpretq_u16_u32(rgbLo), vreinterpretq_u16_u32(rgbHi)), alpha ); +} + +template +FORCEINLINE v128u16 ColorspaceConvert8888To5551_NEON(const v128u32 &srcLo, const v128u32 &srcHi) +{ + return _ConvertColorBaseTo5551_NEON(srcLo, srcHi); +} + +template +FORCEINLINE v128u16 ColorspaceConvert6665To5551_NEON(const v128u32 &srcLo, const v128u32 &srcHi) +{ + return _ConvertColorBaseTo5551_NEON(srcLo, srcHi); +} + +template +FORCEINLINE v128u32 ColorspaceConvert888XTo8888Opaque_NEON(const v128u32 &src) +{ + if (SWAP_RB) + { + return vorrq_u32( COLOR32_SWAPRB_NEON(src), vdupq_n_u32(0xFF000000) ); + } + + return vorrq_u32( src, vdupq_n_u32(0xFF000000) ); +} + +template +FORCEINLINE v128u16 ColorspaceCopy16_NEON(const v128u16 &src) +{ + if (SWAP_RB) + { + return COLOR16_SWAPRB_NEON(src); + } + + return src; +} + +template +FORCEINLINE v128u32 ColorspaceCopy32_NEON(const v128u32 &src) +{ + if (SWAP_RB) + { + return COLOR32_SWAPRB_NEON(src); + } + + return src; +} + +template +FORCEINLINE v128u16 ColorspaceApplyIntensity16_NEON(const v128u16 &src, float intensity) +{ + v128u16 tempSrc = ColorspaceCopy16_NEON(src); + + if (intensity > 0.999f) + { + return tempSrc; + } + else if (intensity < 0.001f) + { + return vandq_u16(tempSrc, vdupq_n_u16(0x8000)); + } + + v128u16 r = vandq_u16( tempSrc, vdupq_n_u16(0x001F) ); + v128u16 g = vandq_u16( vshrq_n_u16(tempSrc, 5), vdupq_n_u16(0x001F) ); + v128u16 b = 
vandq_u16( vshrq_n_u16(tempSrc, 10), vdupq_n_u16(0x001F) ); + v128u16 a = vandq_u16( tempSrc, vdupq_n_u16(0x8000) ); + + const uint16x4_t intensityVec = vdup_n_u16( (u16)(intensity * (float)(0xFFFF)) ); + + r = vuzp2q_u16( vreinterpretq_u16_u32(vmull_u16(vget_low_u16(r), intensityVec)), vreinterpretq_u16_u32(vmull_u16(vget_high_u16(r), intensityVec)) ); + g = vshlq_n_u16( vuzp2q_u16( vreinterpretq_u16_u32(vmull_u16(vget_low_u16(g), intensityVec)), vreinterpretq_u16_u32(vmull_u16(vget_high_u16(g), intensityVec)) ), 5 ); + b = vshlq_n_u16( vuzp2q_u16( vreinterpretq_u16_u32(vmull_u16(vget_low_u16(b), intensityVec)), vreinterpretq_u16_u32(vmull_u16(vget_high_u16(b), intensityVec)) ), 10 ); + + return vorrq_u16( vorrq_u16( vorrq_u16(r, g), b), a); +} + +template +FORCEINLINE v128u32 ColorspaceApplyIntensity32_NEON(const v128u32 &src, float intensity) +{ + v128u32 tempSrc = ColorspaceCopy32_NEON(src); + + if (intensity > 0.999f) + { + return tempSrc; + } + else if (intensity < 0.001f) + { + return vandq_u32(tempSrc, vdupq_n_u32(0xFF000000)); + } + + v128u32 rb = vandq_u32( tempSrc, vdupq_n_u32(0x00FF00FF) ); + v128u32 g = vandq_u32( vshrq_n_u32(tempSrc, 8), vdupq_n_u32(0x000000FF) ); + v128u32 a = vandq_u32( tempSrc, vdupq_n_u32(0xFF000000) ); + + const uint16x4_t intensityVec = vdup_n_u16( (u16)(intensity * (float)(0xFFFF)) ); + + rb = vuzp2q_u32( vmull_u16(vget_low_u16(vreinterpretq_u16_u32(rb)), intensityVec), vmull_u16(vget_high_u16(vreinterpretq_u16_u32(rb)), intensityVec) ); + g = vshlq_n_u32( vuzp2q_u32( vmull_u16(vget_low_u16(vreinterpretq_u16_u32(g) ), intensityVec), vmull_u16(vget_high_u16(vreinterpretq_u16_u32(g) ), intensityVec) ), 8 ); + + return vorrq_u32( vorrq_u32(rb, g), a); +} + +template +static size_t ColorspaceConvertBuffer555To8888Opaque_NEON(const u16 *__restrict src, u32 *__restrict dst, const size_t pixCountVec128) +{ + size_t i = 0; + v128u16 srcVec; + uint32x4x2_t dstVec; + + for (; i < pixCountVec128; i+=(sizeof(v128u16)/sizeof(u16))) + { + 
srcVec = vld1q_u16(src+i); + ColorspaceConvert555To8888Opaque_NEON(srcVec, dstVec.val[0], dstVec.val[1]); + vst1q_u32_x2(dst+i, dstVec); + } + + return i; +} + +template +size_t ColorspaceConvertBuffer555To6665Opaque_NEON(const u16 *__restrict src, u32 *__restrict dst, size_t pixCountVec128) +{ + size_t i = 0; + v128u16 srcVec; + uint32x4x2_t dstVec; + + for (; i < pixCountVec128; i+=(sizeof(v128u16)/sizeof(u16))) + { + srcVec = vld1q_u16(src+i); + ColorspaceConvert555To6665Opaque_NEON(srcVec, dstVec.val[0], dstVec.val[1]); + vst1q_u32_x2(dst+i, dstVec); + } + + return i; +} + +template +size_t ColorspaceConvertBuffer8888To6665_NEON(const u32 *src, u32 *dst, size_t pixCountVec128) +{ + size_t i = 0; + + for (; i < pixCountVec128; i+=(sizeof(v128u32)/sizeof(u32))) + { + vst1q_u32( dst+i, ColorspaceConvert8888To6665_NEON(vld1q_u32(src+i)) ); + } + + return i; +} + +template +size_t ColorspaceConvertBuffer6665To8888_NEON(const u32 *src, u32 *dst, size_t pixCountVec128) +{ + size_t i = 0; + + for (; i < pixCountVec128; i+=(sizeof(v128u32)/sizeof(u32))) + { + vst1q_u32( dst+i, ColorspaceConvert6665To8888_NEON(vld1q_u32(src+i)) ); + } + + return i; +} + +template +size_t ColorspaceConvertBuffer8888To5551_NEON(const u32 *__restrict src, u16 *__restrict dst, size_t pixCountVec128) +{ + size_t i = 0; + uint32x4x2_t srcVec; + + for (; i < pixCountVec128; i+=(sizeof(v128u16)/sizeof(u16))) + { + srcVec = vld1q_u32_x2(src+i); + vst1q_u16( dst+i, ColorspaceConvert8888To5551_NEON(srcVec.val[0], srcVec.val[1]) ); + } + + return i; +} + +template +size_t ColorspaceConvertBuffer6665To5551_NEON(const u32 *__restrict src, u16 *__restrict dst, size_t pixCountVec128) +{ + size_t i = 0; + uint32x4x2_t srcVec; + + for (; i < pixCountVec128; i+=(sizeof(v128u16)/sizeof(u16))) + { + srcVec = vld1q_u32_x2(src+i); + vst1q_u16( dst+i, ColorspaceConvert6665To5551_NEON(srcVec.val[0], srcVec.val[1]) ); + } + + return i; +} + +template +size_t ColorspaceConvertBuffer888XTo8888Opaque_NEON(const u32 
*src, u32 *dst, size_t pixCountVec128) +{ + size_t i = 0; + uint8x16x4_t srcVec_x4; + + for (; i < pixCountVec128; i+=((sizeof(v128u32)/sizeof(u32)) * 4)) + { + srcVec_x4 = vld4q_u8((u8 *)(src+i)); + + if (SWAP_RB) + { + srcVec_x4.val[3] = srcVec_x4.val[0]; // Use the alpha channel as temp storage since we're overwriting it anyways. + srcVec_x4.val[0] = srcVec_x4.val[2]; + srcVec_x4.val[2] = srcVec_x4.val[3]; + } + + srcVec_x4.val[3] = vdupq_n_u8(0xFF); + vst4q_u8((u8 *)(dst+i), *((uint8x16x4_t *)&srcVec_x4)); + } + + return i; +} + +template +size_t ColorspaceConvertBuffer555XTo888_NEON(const u16 *__restrict src, u8 *__restrict dst, size_t pixCountVec128) +{ + size_t i = 0; + uint16x8x2_t srcVec; + uint8x16x3_t dstVec; + uint16x8_t tempRBLo; + uint16x8_t tempRBHi; + + for (; i < pixCountVec128; i+=((sizeof(v128u16)/sizeof(u16)) * 2)) + { + srcVec = vld1q_u16_x2(src+i); + tempRBLo = vorrq_u16( vshlq_n_u16(srcVec.val[0], 11), vshrq_n_u16(srcVec.val[0], 7) ); + tempRBHi = vorrq_u16( vshlq_n_u16(srcVec.val[1], 11), vshrq_n_u16(srcVec.val[1], 7) ); + + if (SWAP_RB) + { + dstVec.val[2] = vandq_u8( vuzp1q_u8(vreinterpretq_u8_u16(tempRBLo), vreinterpretq_u8_u16(tempRBHi)), vdupq_n_u8(0xF8) ); + dstVec.val[0] = vandq_u8( vuzp2q_u8(vreinterpretq_u8_u16(tempRBLo), vreinterpretq_u8_u16(tempRBHi)), vdupq_n_u8(0xF8) ); + } + else + { + dstVec.val[0] = vandq_u8( vuzp1q_u8(vreinterpretq_u8_u16(tempRBLo), vreinterpretq_u8_u16(tempRBHi)), vdupq_n_u8(0xF8) ); + dstVec.val[2] = vandq_u8( vuzp2q_u8(vreinterpretq_u8_u16(tempRBLo), vreinterpretq_u8_u16(tempRBHi)), vdupq_n_u8(0xF8) ); + } + + dstVec.val[1] = vandq_u8( vuzp1q_u8( vreinterpretq_u8_u16(vshrq_n_u16(srcVec.val[0], 2)), vreinterpretq_u8_u16(vshrq_n_u16(srcVec.val[1], 2)) ), vdupq_n_u8(0xF8) ); + + dstVec.val[0] = vorrq_u8(dstVec.val[0], vshrq_n_u8(dstVec.val[0], 5)); + dstVec.val[1] = vorrq_u8(dstVec.val[1], vshrq_n_u8(dstVec.val[1], 5)); + dstVec.val[2] = vorrq_u8(dstVec.val[2], vshrq_n_u8(dstVec.val[2], 5)); + + 
vst3q_u8(dst+(i*3), dstVec); + } + + return i; +} + +template +size_t ColorspaceConvertBuffer888XTo888_NEON(const u32 *__restrict src, u8 *__restrict dst, size_t pixCountVec128) +{ + size_t i = 0; + uint8x16x4_t srcVec_x4; + + for (; i < pixCountVec128; i+=((sizeof(v128u32)/sizeof(u32)) * 4)) + { + srcVec_x4 = vld4q_u8((u8 *)(src+i)); + + if (SWAP_RB) + { + srcVec_x4.val[3] = srcVec_x4.val[0]; // Use the alpha channel as temp storage since we're dropping it anyways. + srcVec_x4.val[0] = srcVec_x4.val[2]; + srcVec_x4.val[2] = srcVec_x4.val[3]; + } + + vst3q_u8(dst+(i*3), *((uint8x16x3_t *)&srcVec_x4)); + } + + return i; +} + +template +size_t ColorspaceCopyBuffer16_NEON(const u16 *src, u16 *dst, size_t pixCountVec128) +{ + if (!SWAP_RB) + { + memcpy(dst, src, pixCountVec128 * sizeof(u16)); + return pixCountVec128; + } + + size_t i = 0; + + for (; i < pixCountVec128; i+=(sizeof(v128u16)/sizeof(u16))) + { + v128u16 src_vec128 = vld1q_u16(src+i); + vst1q_u16(dst+i, ColorspaceCopy16_NEON(src_vec128)); + } + + return i; +} + +template +size_t ColorspaceCopyBuffer32_NEON(const u32 *src, u32 *dst, size_t pixCountVec128) +{ + if (!SWAP_RB) + { + memcpy(dst, src, pixCountVec128 * sizeof(u32)); + return pixCountVec128; + } + + size_t i = 0; + + for (; i < pixCountVec128; i+=(sizeof(v128u32)/sizeof(u32))) + { + v128u32 src_vec128 = vld1q_u32(src+i); + vst1q_u32(dst+i, ColorspaceCopy32_NEON(src_vec128)); + } + + return i; +} + +template +size_t ColorspaceApplyIntensityToBuffer16_NEON(u16 *dst, size_t pixCountVec128, float intensity) +{ + size_t i = 0; + + if (intensity > 0.999f) + { + if (SWAP_RB) + { + for (; i < pixCountVec128; i+=(sizeof(v128u16)/sizeof(u16))) + { + const v128u16 dstVec = vld1q_u16(dst+i); + const v128u16 tempDst = COLOR16_SWAPRB_NEON(dstVec); + vst1q_u16(dst+i, tempDst); + } + } + else + { + return pixCountVec128; + } + } + else if (intensity < 0.001f) + { + const uint16x8_t alphaMask = vdupq_n_u16(0x8000); + uint16x8x4_t src; + + for (; i < pixCountVec128; 
i+=((sizeof(v128u16)/sizeof(u16))*4)) + { + src = vld1q_u16_x4(dst+i); + src.val[0] = vandq_u16(src.val[0], alphaMask); + src.val[1] = vandq_u16(src.val[1], alphaMask); + src.val[2] = vandq_u16(src.val[2], alphaMask); + src.val[3] = vandq_u16(src.val[3], alphaMask); + + vst1q_u16_x4(dst+i, src); + } + } + else + { + const uint16x4_t intensityVec = vdup_n_u16( (u16)(intensity * (float)(0xFFFF)) ); + + for (; i < pixCountVec128; i+=(sizeof(v128u16)/sizeof(u16))) + { + const v128u16 dstVec = vld1q_u16(dst+i); + v128u16 tempDst = (SWAP_RB) ? COLOR16_SWAPRB_NEON(dstVec) : dstVec; + + v128u16 r = vandq_u16( tempDst, vdupq_n_u16(0x001F) ); + v128u16 g = vandq_u16( vshrq_n_u16(tempDst, 5), vdupq_n_u16(0x001F) ); + v128u16 b = vandq_u16( vshrq_n_u16(tempDst, 10), vdupq_n_u16(0x001F) ); + v128u16 a = vandq_u16( tempDst, vdupq_n_u16(0x8000) ); + + r = vuzp2q_u16( vreinterpretq_u16_u32(vmull_u16(vget_low_u16(r), intensityVec)), vreinterpretq_u16_u32(vmull_u16(vget_high_u16(r), intensityVec)) ); + g = vshlq_n_u16( vuzp2q_u16( vreinterpretq_u16_u32(vmull_u16(vget_low_u16(g), intensityVec)), vreinterpretq_u16_u32(vmull_u16(vget_high_u16(g), intensityVec)) ), 5 ); + b = vshlq_n_u16( vuzp2q_u16( vreinterpretq_u16_u32(vmull_u16(vget_low_u16(b), intensityVec)), vreinterpretq_u16_u32(vmull_u16(vget_high_u16(b), intensityVec)) ), 10 ); + + tempDst = vorrq_u16( vorrq_u16( vorrq_u16(r, g), b), a); + + vst1q_u16(dst+i, tempDst); + } + } + + return i; +} + +template +size_t ColorspaceApplyIntensityToBuffer32_NEON(u32 *dst, size_t pixCountVec128, float intensity) +{ + size_t i = 0; + + if (intensity > 0.999f) + { + if (SWAP_RB) + { + uint32x4x4_t src; + + for (; i < pixCountVec128; i+=((sizeof(v128u32)/sizeof(u32))*4)) + { + src = vld1q_u32_x4(dst+i); + src.val[0] = COLOR32_SWAPRB_NEON(src.val[0]); + src.val[1] = COLOR32_SWAPRB_NEON(src.val[1]); + src.val[2] = COLOR32_SWAPRB_NEON(src.val[2]); + src.val[3] = COLOR32_SWAPRB_NEON(src.val[3]); + + vst1q_u32_x4(dst+i, src); + } + } + else + { + 
return pixCountVec128; + } + } + else if (intensity < 0.001f) + { + const uint32x4_t alphaMask = vdupq_n_u32(0xFF000000); + uint32x4x4_t src; + + for (; i < pixCountVec128; i+=((sizeof(v128u32)/sizeof(u32))*4)) + { + src = vld1q_u32_x4(dst+i); + src.val[0] = vandq_u32(src.val[0], alphaMask); + src.val[1] = vandq_u32(src.val[1], alphaMask); + src.val[2] = vandq_u32(src.val[2], alphaMask); + src.val[3] = vandq_u32(src.val[3], alphaMask); + + vst1q_u32_x4(dst+i, src); + } + } + else + { + const uint16x4_t intensityVec = vdup_n_u16( (u16)(intensity * (float)(0xFFFF)) ); + + for (; i < pixCountVec128; i+=(sizeof(v128u32)/sizeof(u32))) + { + v128u32 dstVec = vld1q_u32(dst+i); + v128u32 tempDst = (SWAP_RB) ? COLOR32_SWAPRB_NEON(dstVec) : dstVec; + + v128u32 rb = vandq_u32( tempDst, vdupq_n_u32(0x00FF00FF) ); + v128u32 g = vandq_u32( vshrq_n_u32(tempDst, 8), vdupq_n_u32(0x000000FF) ); + v128u32 a = vandq_u32( tempDst, vdupq_n_u32(0xFF000000) ); + + rb = vreinterpretq_u32_u16( vuzp2q_u16( vreinterpretq_u16_u32(vmull_u16(vget_low_u16(vreinterpretq_u16_u32(rb)), intensityVec)), vreinterpretq_u16_u32(vmull_u16(vget_high_u16(vreinterpretq_u16_u32(rb)), intensityVec)) ) ); // keep the high 16 bits of each 16x16 product (odd u16 lanes), same as the 16-bit variant + g = vshlq_n_u32( vreinterpretq_u32_u16( vuzp2q_u16( vreinterpretq_u16_u32(vmull_u16(vget_low_u16(vreinterpretq_u16_u32(g) ), intensityVec)), vreinterpretq_u16_u32(vmull_u16(vget_high_u16(vreinterpretq_u16_u32(g) ), intensityVec)) ) ), 8 ); + + tempDst = vorrq_u32( vorrq_u32(rb, g), a); + vst1q_u32(dst+i, tempDst); + } + } + + return i; +} + +template +size_t ColorspaceHandler_NEON::ConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To8888Opaque_NEON(src, dst, pixCount); +} + +template +size_t ColorspaceHandler_NEON::ConvertBuffer555To8888Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To8888Opaque_NEON(src, dst, pixCount); +} + +template +size_t ColorspaceHandler_NEON::ConvertBuffer555To8888Opaque_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + 
return ColorspaceConvertBuffer555To8888Opaque_NEON(src, dst, pixCount); +} + +template +size_t ColorspaceHandler_NEON::ConvertBuffer555To8888Opaque_SwapRB_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To8888Opaque_NEON(src, dst, pixCount); +} + +template +size_t ColorspaceHandler_NEON::ConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To6665Opaque_NEON(src, dst, pixCount); +} + +template +size_t ColorspaceHandler_NEON::ConvertBuffer555To6665Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To6665Opaque_NEON(src, dst, pixCount); +} + +template +size_t ColorspaceHandler_NEON::ConvertBuffer555To6665Opaque_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To6665Opaque_NEON(src, dst, pixCount); +} + +template +size_t ColorspaceHandler_NEON::ConvertBuffer555To6665Opaque_SwapRB_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To6665Opaque_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::ConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To6665_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::ConvertBuffer8888To6665_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To6665_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::ConvertBuffer8888To6665_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To6665_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::ConvertBuffer8888To6665_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To6665_NEON(src, 
dst, pixCount); +} + +size_t ColorspaceHandler_NEON::ConvertBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To8888_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::ConvertBuffer6665To8888_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To8888_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::ConvertBuffer6665To8888_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To8888_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::ConvertBuffer6665To8888_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To8888_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::ConvertBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To5551_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::ConvertBuffer8888To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To5551_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::ConvertBuffer8888To5551_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To5551_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::ConvertBuffer8888To5551_SwapRB_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To5551_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::ConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To5551_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::ConvertBuffer6665To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return 
ColorspaceConvertBuffer6665To5551_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::ConvertBuffer6665To5551_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To5551_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::ConvertBuffer6665To5551_SwapRB_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To5551_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::ConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo8888Opaque_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::ConvertBuffer888XTo8888Opaque_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo8888Opaque_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::ConvertBuffer888XTo8888Opaque_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo8888Opaque_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::ConvertBuffer888XTo8888Opaque_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo8888Opaque_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::ConvertBuffer555XTo888(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555XTo888_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::ConvertBuffer555XTo888_SwapRB(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555XTo888_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::ConvertBuffer555XTo888_IsUnaligned(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555XTo888_NEON(src, dst, pixCount); +} + +size_t 
ColorspaceHandler_NEON::ConvertBuffer555XTo888_SwapRB_IsUnaligned(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555XTo888_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::ConvertBuffer888XTo888(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo888_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::ConvertBuffer888XTo888_SwapRB(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo888_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::ConvertBuffer888XTo888_IsUnaligned(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo888_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::ConvertBuffer888XTo888_SwapRB_IsUnaligned(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo888_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const +{ + return ColorspaceCopyBuffer16_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::CopyBuffer16_SwapRB_IsUnaligned(const u16 *src, u16 *dst, size_t pixCount) const +{ + return ColorspaceCopyBuffer16_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::CopyBuffer32_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceCopyBuffer32_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::CopyBuffer32_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceCopyBuffer32_NEON(src, dst, pixCount); +} + +size_t ColorspaceHandler_NEON::ApplyIntensityToBuffer16(u16 *dst, size_t pixCount, float intensity) const +{ + return ColorspaceApplyIntensityToBuffer16_NEON(dst, pixCount, intensity); +} + +size_t 
ColorspaceHandler_NEON::ApplyIntensityToBuffer16_SwapRB(u16 *dst, size_t pixCount, float intensity) const +{ + return ColorspaceApplyIntensityToBuffer16_NEON(dst, pixCount, intensity); +} + +size_t ColorspaceHandler_NEON::ApplyIntensityToBuffer16_IsUnaligned(u16 *dst, size_t pixCount, float intensity) const +{ + return ColorspaceApplyIntensityToBuffer16_NEON(dst, pixCount, intensity); +} + +size_t ColorspaceHandler_NEON::ApplyIntensityToBuffer16_SwapRB_IsUnaligned(u16 *dst, size_t pixCount, float intensity) const +{ + return ColorspaceApplyIntensityToBuffer16_NEON(dst, pixCount, intensity); +} + +size_t ColorspaceHandler_NEON::ApplyIntensityToBuffer32(u32 *dst, size_t pixCount, float intensity) const +{ + return ColorspaceApplyIntensityToBuffer32_NEON(dst, pixCount, intensity); +} + +size_t ColorspaceHandler_NEON::ApplyIntensityToBuffer32_SwapRB(u32 *dst, size_t pixCount, float intensity) const +{ + return ColorspaceApplyIntensityToBuffer32_NEON(dst, pixCount, intensity); +} + +size_t ColorspaceHandler_NEON::ApplyIntensityToBuffer32_IsUnaligned(u32 *dst, size_t pixCount, float intensity) const +{ + return ColorspaceApplyIntensityToBuffer32_NEON(dst, pixCount, intensity); +} + +size_t ColorspaceHandler_NEON::ApplyIntensityToBuffer32_SwapRB_IsUnaligned(u32 *dst, size_t pixCount, float intensity) const +{ + return ColorspaceApplyIntensityToBuffer32_NEON(dst, pixCount, intensity); +} + +template void ColorspaceConvert555To8888_NEON(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To8888_NEON(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi); + +template void ColorspaceConvert555XTo888X_NEON(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555XTo888X_NEON(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); + +template void ColorspaceConvert555To6665_NEON(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 
&dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To6665_NEON(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi); + +template void ColorspaceConvert555XTo666X_NEON(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555XTo666X_NEON(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); + +template void ColorspaceConvert555To8888Opaque_NEON(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To8888Opaque_NEON(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); + +template void ColorspaceConvert555To6665Opaque_NEON(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To6665Opaque_NEON(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); + +template v128u32 ColorspaceConvert8888To6665_NEON(const v128u32 &src); +template v128u32 ColorspaceConvert8888To6665_NEON(const v128u32 &src); + +template v128u32 ColorspaceConvert6665To8888_NEON(const v128u32 &src); +template v128u32 ColorspaceConvert6665To8888_NEON(const v128u32 &src); + +template v128u16 ColorspaceConvert8888To5551_NEON(const v128u32 &srcLo, const v128u32 &srcHi); +template v128u16 ColorspaceConvert8888To5551_NEON(const v128u32 &srcLo, const v128u32 &srcHi); + +template v128u16 ColorspaceConvert6665To5551_NEON(const v128u32 &srcLo, const v128u32 &srcHi); +template v128u16 ColorspaceConvert6665To5551_NEON(const v128u32 &srcLo, const v128u32 &srcHi); + +template v128u32 ColorspaceConvert888XTo8888Opaque_NEON(const v128u32 &src); +template v128u32 ColorspaceConvert888XTo8888Opaque_NEON(const v128u32 &src); + +template v128u16 ColorspaceCopy16_NEON(const v128u16 &src); +template v128u16 ColorspaceCopy16_NEON(const v128u16 &src); + +template v128u32 ColorspaceCopy32_NEON(const v128u32 &src); +template v128u32 ColorspaceCopy32_NEON(const v128u32 &src); + +template v128u16 ColorspaceApplyIntensity16_NEON(const v128u16 &src, float intensity); 
+template v128u16 ColorspaceApplyIntensity16_NEON(const v128u16 &src, float intensity); + +template v128u32 ColorspaceApplyIntensity32_NEON(const v128u32 &src, float intensity); +template v128u32 ColorspaceApplyIntensity32_NEON(const v128u32 &src, float intensity); + +#endif // ENABLE_NEON_A64 diff --git a/FSCHMasterEditor/nds/colorspacehandler/colorspacehandler_NEON.h b/FSCHMasterEditor/nds/colorspacehandler/colorspacehandler_NEON.h new file mode 100644 index 0000000..0669fb6 --- /dev/null +++ b/FSCHMasterEditor/nds/colorspacehandler/colorspacehandler_NEON.h @@ -0,0 +1,114 @@ +/* + Copyright (C) 2016-2022 DeSmuME team + + This file is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This file is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this software. If not, see https://www.gnu.org/licenses/. + */ + +#ifndef COLORSPACEHANDLER_NEON_H +#define COLORSPACEHANDLER_NEON_H + +#include "colorspacehandler.h" + +#ifndef ENABLE_NEON_A64 + #warning This header requires ARM64 NEON support. 
+#else + +template void ColorspaceConvert555To8888_NEON(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555XTo888X_NEON(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To6665_NEON(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555XTo666X_NEON(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To8888Opaque_NEON(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To6665Opaque_NEON(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template v128u32 ColorspaceConvert8888To6665_NEON(const v128u32 &src); +template v128u32 ColorspaceConvert6665To8888_NEON(const v128u32 &src); +template v128u16 ColorspaceConvert8888To5551_NEON(const v128u32 &srcLo, const v128u32 &srcHi); +template v128u16 ColorspaceConvert6665To5551_NEON(const v128u32 &srcLo, const v128u32 &srcHi); +template v128u32 ColorspaceConvert888XTo8888Opaque_NEON(const v128u32 &src); + +template v128u16 ColorspaceCopy16_NEON(const v128u16 &src); +template v128u32 ColorspaceCopy32_NEON(const v128u32 &src); + +template v128u16 ColorspaceApplyIntensity16_NEON(const v128u16 &src, float intensity); +template v128u32 ColorspaceApplyIntensity32_NEON(const v128u32 &src, float intensity); + +class ColorspaceHandler_NEON : public ColorspaceHandler +{ +public: + ColorspaceHandler_NEON() {}; + + template size_t ConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + template size_t ConvertBuffer555To8888Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + template size_t ConvertBuffer555To8888Opaque_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + template size_t ConvertBuffer555To8888Opaque_SwapRB_IsUnaligned(const u16 *__restrict 
src, u32 *__restrict dst, size_t pixCount) const; + + template size_t ConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + template size_t ConvertBuffer555To6665Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + template size_t ConvertBuffer555To6665Opaque_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + template size_t ConvertBuffer555To6665Opaque_SwapRB_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + + size_t ConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer8888To6665_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer8888To6665_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer8888To6665_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + + size_t ConvertBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer6665To8888_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer6665To8888_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer6665To8888_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + + size_t ConvertBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer8888To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer8888To5551_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer8888To5551_SwapRB_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + + size_t ConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer6665To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t 
ConvertBuffer6665To5551_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer6665To5551_SwapRB_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + + size_t ConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer888XTo8888Opaque_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer888XTo8888Opaque_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer888XTo8888Opaque_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + + size_t ConvertBuffer555XTo888(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555XTo888_SwapRB(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555XTo888_IsUnaligned(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555XTo888_SwapRB_IsUnaligned(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + + size_t ConvertBuffer888XTo888(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer888XTo888_SwapRB(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer888XTo888_IsUnaligned(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer888XTo888_SwapRB_IsUnaligned(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + + size_t CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const; + size_t CopyBuffer16_SwapRB_IsUnaligned(const u16 *src, u16 *dst, size_t pixCount) const; + + size_t CopyBuffer32_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; + size_t CopyBuffer32_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + + size_t ApplyIntensityToBuffer16(u16 *dst, size_t pixCount, float intensity) const; + size_t 
ApplyIntensityToBuffer16_SwapRB(u16 *dst, size_t pixCount, float intensity) const; + size_t ApplyIntensityToBuffer16_IsUnaligned(u16 *dst, size_t pixCount, float intensity) const; + size_t ApplyIntensityToBuffer16_SwapRB_IsUnaligned(u16 *dst, size_t pixCount, float intensity) const; + + size_t ApplyIntensityToBuffer32(u32 *dst, size_t pixCount, float intensity) const; + size_t ApplyIntensityToBuffer32_SwapRB(u32 *dst, size_t pixCount, float intensity) const; + size_t ApplyIntensityToBuffer32_IsUnaligned(u32 *dst, size_t pixCount, float intensity) const; + size_t ApplyIntensityToBuffer32_SwapRB_IsUnaligned(u32 *dst, size_t pixCount, float intensity) const; +}; + +#endif // ENABLE_NEON_A64 + +#endif // COLORSPACEHANDLER_NEON_H diff --git a/FSCHMasterEditor/nds/colorspacehandler/colorspacehandler_SSE2.cpp b/FSCHMasterEditor/nds/colorspacehandler/colorspacehandler_SSE2.cpp new file mode 100644 index 0000000..d933ed4 --- /dev/null +++ b/FSCHMasterEditor/nds/colorspacehandler/colorspacehandler_SSE2.cpp @@ -0,0 +1,1236 @@ +/* + Copyright (C) 2016-2021 DeSmuME team + + This file is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This file is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this software. If not, see https://www.gnu.org/licenses/. + */ + +#include "colorspacehandler_SSE2.h" + +#ifndef ENABLE_SSE2 + #error This code requires SSE2 support. 
+#else + +#include +#include + +#ifdef ENABLE_SSSE3 +#include +#endif + +#ifdef ENABLE_SSE4_1 +#include +#endif + +template +FORCEINLINE void ColorspaceConvert555To8888_SSE2(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi) +{ + // Conversion algorithm: + // RGB 5-bit to 8-bit formula: dstRGB8 = (srcRGB5 << 3) | ((srcRGB5 >> 2) & 0x07) + + if (SWAP_RB) + { + v128u16 rb = _mm_or_si128( _mm_slli_epi16(srcColor,11), _mm_and_si128(_mm_srli_epi16(srcColor, 7), _mm_set1_epi16(0x00F8)) ); + rb = _mm_or_si128(rb, _mm_and_si128(_mm_srli_epi16(rb, 5), _mm_set1_epi16(0x0707))); + + v128u16 ga = _mm_and_si128(_mm_srli_epi16(srcColor, 2), _mm_set1_epi16(0x00F8) ); + ga = _mm_or_si128(ga, _mm_srli_epi16(ga, 5)); + ga = _mm_or_si128(ga, srcAlphaBits); + + dstLo = _mm_unpacklo_epi8(rb, ga); + dstHi = _mm_unpackhi_epi8(rb, ga); + } + else + { + const v128u16 r = _mm_and_si128( _mm_slli_epi16(srcColor, 3), _mm_set1_epi16(0x00F8) ); + v128u16 rg = _mm_or_si128( r, _mm_and_si128(_mm_slli_epi16(srcColor, 6), _mm_set1_epi16(0xF800)) ); + rg = _mm_or_si128( rg, _mm_and_si128(_mm_srli_epi16(rg, 5), _mm_set1_epi16(0x0707)) ); + + v128u16 ba = _mm_and_si128( _mm_srli_epi16(srcColor, 7), _mm_set1_epi16(0x00F8) ); + ba = _mm_or_si128(ba, _mm_srli_epi16(ba, 5)); + ba = _mm_or_si128(ba, srcAlphaBits); + + dstLo = _mm_unpacklo_epi16(rg, ba); + dstHi = _mm_unpackhi_epi16(rg, ba); + } +} + +template +FORCEINLINE void ColorspaceConvert555XTo888X_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi) +{ + // Conversion algorithm: + // RGB 5-bit to 8-bit formula: dstRGB8 = (srcRGB5 << 3) | ((srcRGB5 >> 2) & 0x07) + + if (SWAP_RB) + { + v128u16 rb = _mm_or_si128( _mm_slli_epi16(srcColor,11), _mm_and_si128(_mm_srli_epi16(srcColor, 7), _mm_set1_epi16(0x00F8)) ); + rb = _mm_or_si128(rb, _mm_and_si128(_mm_srli_epi16(rb, 5), _mm_set1_epi16(0x0707))); + + v128u16 g = _mm_and_si128(_mm_srli_epi16(srcColor, 2), _mm_set1_epi16(0x00F8) ); + g = _mm_or_si128(g, 
_mm_srli_epi16(g, 5)); + + dstLo = _mm_unpacklo_epi8(rb, g); + dstHi = _mm_unpackhi_epi8(rb, g); + } + else + { + const v128u16 r = _mm_and_si128( _mm_slli_epi16(srcColor, 3), _mm_set1_epi16(0x00F8) ); + v128u16 rg = _mm_or_si128( r, _mm_and_si128(_mm_slli_epi16(srcColor, 6), _mm_set1_epi16(0xF800)) ); + rg = _mm_or_si128( rg, _mm_and_si128(_mm_srli_epi16(rg, 5), _mm_set1_epi16(0x0707)) ); + + v128u16 b = _mm_and_si128( _mm_srli_epi16(srcColor, 7), _mm_set1_epi16(0x00F8) ); + b = _mm_or_si128(b, _mm_srli_epi16(b, 5)); + + dstLo = _mm_unpacklo_epi16(rg, b); + dstHi = _mm_unpackhi_epi16(rg, b); + } +} + +template +FORCEINLINE void ColorspaceConvert555To6665_SSE2(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi) +{ + // Conversion algorithm: + // RGB 5-bit to 6-bit formula: dstRGB6 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01) + + if (SWAP_RB) + { + v128u16 rb = _mm_and_si128( _mm_or_si128( _mm_slli_epi16(srcColor,9), _mm_srli_epi16(srcColor, 9)), _mm_set1_epi16(0x3E3E) ); + rb = _mm_or_si128(rb, _mm_and_si128(_mm_srli_epi16(rb, 5), _mm_set1_epi16(0x0101))); + + v128u16 ga = _mm_and_si128(_mm_srli_epi16(srcColor, 4), _mm_set1_epi16(0x003E) ); + ga = _mm_or_si128(ga, _mm_srli_epi16(ga, 5)); + ga = _mm_or_si128(ga, srcAlphaBits); + + dstLo = _mm_unpacklo_epi8(rb, ga); + dstHi = _mm_unpackhi_epi8(rb, ga); + } + else + { + const v128u16 r = _mm_and_si128( _mm_slli_epi16(srcColor, 1), _mm_set1_epi16(0x003E) ); + const v128u16 b = _mm_and_si128( _mm_srli_epi16(srcColor, 9), _mm_set1_epi16(0x003E) ); + + v128u16 rg = _mm_or_si128( r, _mm_and_si128(_mm_slli_epi16(srcColor, 4), _mm_set1_epi16(0x3E00)) ); + rg = _mm_or_si128( rg, _mm_and_si128(_mm_srli_epi16(rg, 5), _mm_set1_epi16(0x0101)) ); + + v128u16 ba = _mm_or_si128(b, _mm_srli_epi16(b, 5)); + ba = _mm_or_si128(ba, srcAlphaBits); + + dstLo = _mm_unpacklo_epi16(rg, ba); + dstHi = _mm_unpackhi_epi16(rg, ba); + } +} + +template +FORCEINLINE void ColorspaceConvert555XTo666X_SSE2(const v128u16 
&srcColor, v128u32 &dstLo, v128u32 &dstHi) +{ + // Conversion algorithm: + // RGB 5-bit to 6-bit formula: dstRGB6 = (srcRGB5 << 1) | ((srcRGB5 >> 4) & 0x01) + + if (SWAP_RB) + { + v128u16 rb = _mm_and_si128( _mm_or_si128( _mm_slli_epi16(srcColor,9), _mm_srli_epi16(srcColor, 9)), _mm_set1_epi16(0x3E3E) ); + rb = _mm_or_si128(rb, _mm_and_si128(_mm_srli_epi16(rb, 5), _mm_set1_epi16(0x0101))); + + v128u16 g = _mm_and_si128(_mm_srli_epi16(srcColor, 4), _mm_set1_epi16(0x003E) ); + g = _mm_or_si128(g, _mm_srli_epi16(g, 5)); + + dstLo = _mm_unpacklo_epi8(rb, g); + dstHi = _mm_unpackhi_epi8(rb, g); + } + else + { + const v128u16 r = _mm_and_si128( _mm_slli_epi16(srcColor, 1), _mm_set1_epi16(0x003E) ); + v128u16 rg = _mm_or_si128( r, _mm_and_si128(_mm_slli_epi16(srcColor, 4), _mm_set1_epi16(0x3E00)) ); + rg = _mm_or_si128( rg, _mm_and_si128(_mm_srli_epi16(rg, 5), _mm_set1_epi16(0x0101)) ); + + v128u16 b = _mm_and_si128( _mm_srli_epi16(srcColor, 9), _mm_set1_epi16(0x003E) ); + b = _mm_or_si128(b, _mm_srli_epi16(b, 5)); + + dstLo = _mm_unpacklo_epi16(rg, b); + dstHi = _mm_unpackhi_epi16(rg, b); + } +} + +template +FORCEINLINE void ColorspaceConvert555To8888Opaque_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi) +{ + const v128u16 srcAlphaBits16 = _mm_set1_epi16(0xFF00); + ColorspaceConvert555To8888_SSE2(srcColor, srcAlphaBits16, dstLo, dstHi); +} + +template +FORCEINLINE void ColorspaceConvert555To6665Opaque_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi) +{ + const v128u16 srcAlphaBits16 = _mm_set1_epi16(0x1F00); + ColorspaceConvert555To6665_SSE2(srcColor, srcAlphaBits16, dstLo, dstHi); +} + +template +FORCEINLINE v128u32 ColorspaceConvert8888To6665_SSE2(const v128u32 &src) +{ + // Conversion algorithm: + // RGB 8-bit to 6-bit formula: dstRGB6 = (srcRGB8 >> 2) + // Alpha 8-bit to 6-bit formula: dstA5 = (srcA8 >> 3) + v128u32 rgb; + const v128u32 a = _mm_and_si128( _mm_srli_epi32(src, 3), _mm_set1_epi32(0x1F000000) ); + + if (SWAP_RB) + { +#ifdef 
ENABLE_SSSE3 + rgb = _mm_and_si128( _mm_srli_epi32(src, 2), _mm_set1_epi32(0x003F3F3F) ); + rgb = _mm_shuffle_epi8( rgb, _mm_set_epi8(15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2) ); +#else + rgb = _mm_or_si128( _mm_and_si128(_mm_srli_epi32(src, 18), _mm_set1_epi32(0x0000003F)), _mm_or_si128( _mm_and_si128(_mm_srli_epi32(src, 2), _mm_set1_epi32(0x00003F00)), _mm_and_si128(_mm_slli_epi32(src, 14), _mm_set1_epi32(0x003F0000))) ); +#endif + } + else + { + rgb = _mm_and_si128( _mm_srli_epi32(src, 2), _mm_set1_epi32(0x003F3F3F) ); + } + + return _mm_or_si128(rgb, a); +} + +template +FORCEINLINE v128u32 ColorspaceConvert6665To8888_SSE2(const v128u32 &src) +{ + // Conversion algorithm: + // RGB 6-bit to 8-bit formula: dstRGB8 = (srcRGB6 << 2) | ((srcRGB6 >> 4) & 0x03) + // Alpha 5-bit to 8-bit formula: dstA8 = (srcA5 << 3) | ((srcA5 >> 2) & 0x07) + v128u32 rgb = _mm_or_si128( _mm_and_si128(_mm_slli_epi32(src, 2), _mm_set1_epi32(0x00FCFCFC)), _mm_and_si128(_mm_srli_epi32(src, 4), _mm_set1_epi32(0x00030303)) ); + const v128u32 a = _mm_or_si128( _mm_and_si128(_mm_slli_epi32(src, 3), _mm_set1_epi32(0xF8000000)), _mm_and_si128(_mm_srli_epi32(src, 2), _mm_set1_epi32(0x07000000)) ); + + if (SWAP_RB) + { +#ifdef ENABLE_SSSE3 + rgb = _mm_shuffle_epi8( rgb, _mm_set_epi8(15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2) ); +#else + rgb = _mm_or_si128( _mm_srli_epi32(_mm_and_si128(rgb, _mm_set1_epi32(0x00FF0000)), 16), _mm_or_si128(_mm_and_si128(rgb, _mm_set1_epi32(0x0000FF00)), _mm_slli_epi32(_mm_and_si128(rgb, _mm_set1_epi32(0x000000FF)), 16)) ); // swap R/B on the expanded 8-bit value, not on the raw 6-bit src +#endif + } + + return _mm_or_si128(rgb, a); +} + +template +FORCEINLINE v128u16 _ConvertColorBaseTo5551_SSE2(const v128u32 &srcLo, const v128u32 &srcHi) +{ + if (COLORFORMAT == NDSColorFormat_BGR555_Rev) + { + return srcLo; + } + + v128u32 rgbLo; + v128u32 rgbHi; + v128u16 alpha; + + if (COLORFORMAT == NDSColorFormat_BGR666_Rev) + { + if (SWAP_RB) + { + // Convert color from low bits + rgbLo = _mm_and_si128(_mm_srli_epi32(srcLo, 17), 
_mm_set1_epi32(0x0000001F)); + rgbLo = _mm_or_si128(rgbLo, _mm_and_si128(_mm_srli_epi32(srcLo, 4), _mm_set1_epi32(0x000003E0)) ); + rgbLo = _mm_or_si128(rgbLo, _mm_and_si128(_mm_slli_epi32(srcLo, 9), _mm_set1_epi32(0x00007C00)) ); + + // Convert color from high bits + rgbHi = _mm_and_si128(_mm_srli_epi32(srcHi, 17), _mm_set1_epi32(0x0000001F)); + rgbHi = _mm_or_si128(rgbHi, _mm_and_si128(_mm_srli_epi32(srcHi, 4), _mm_set1_epi32(0x000003E0)) ); + rgbHi = _mm_or_si128(rgbHi, _mm_and_si128(_mm_slli_epi32(srcHi, 9), _mm_set1_epi32(0x00007C00)) ); + } + else + { + // Convert color from low bits + rgbLo = _mm_and_si128(_mm_srli_epi32(srcLo, 1), _mm_set1_epi32(0x0000001F)); + rgbLo = _mm_or_si128(rgbLo, _mm_and_si128(_mm_srli_epi32(srcLo, 4), _mm_set1_epi32(0x000003E0)) ); + rgbLo = _mm_or_si128(rgbLo, _mm_and_si128(_mm_srli_epi32(srcLo, 7), _mm_set1_epi32(0x00007C00)) ); + + // Convert color from high bits + rgbHi = _mm_and_si128(_mm_srli_epi32(srcHi, 1), _mm_set1_epi32(0x0000001F)); + rgbHi = _mm_or_si128(rgbHi, _mm_and_si128(_mm_srli_epi32(srcHi, 4), _mm_set1_epi32(0x000003E0)) ); + rgbHi = _mm_or_si128(rgbHi, _mm_and_si128(_mm_srli_epi32(srcHi, 7), _mm_set1_epi32(0x00007C00)) ); + } + + // Convert alpha + alpha = _mm_packs_epi32( _mm_and_si128(_mm_srli_epi32(srcLo, 24), _mm_set1_epi32(0x0000001F)), _mm_and_si128(_mm_srli_epi32(srcHi, 24), _mm_set1_epi32(0x0000001F)) ); + alpha = _mm_cmpgt_epi16(alpha, _mm_setzero_si128()); + alpha = _mm_and_si128(alpha, _mm_set1_epi16(0x8000)); + } + else if (COLORFORMAT == NDSColorFormat_BGR888_Rev) + { + if (SWAP_RB) + { + // Convert color from low bits + rgbLo = _mm_and_si128(_mm_srli_epi32(srcLo, 19), _mm_set1_epi32(0x0000001F)); + rgbLo = _mm_or_si128(rgbLo, _mm_and_si128(_mm_srli_epi32(srcLo, 6), _mm_set1_epi32(0x000003E0)) ); + rgbLo = _mm_or_si128(rgbLo, _mm_and_si128(_mm_slli_epi32(srcLo, 7), _mm_set1_epi32(0x00007C00)) ); + + // Convert color from high bits + rgbHi = _mm_and_si128(_mm_srli_epi32(srcHi, 19), 
_mm_set1_epi32(0x0000001F)); + rgbHi = _mm_or_si128(rgbHi, _mm_and_si128(_mm_srli_epi32(srcHi, 6), _mm_set1_epi32(0x000003E0)) ); + rgbHi = _mm_or_si128(rgbHi, _mm_and_si128(_mm_slli_epi32(srcHi, 7), _mm_set1_epi32(0x00007C00)) ); + } + else + { + // Convert color from low bits + rgbLo = _mm_and_si128(_mm_srli_epi32(srcLo, 3), _mm_set1_epi32(0x0000001F)); + rgbLo = _mm_or_si128(rgbLo, _mm_and_si128(_mm_srli_epi32(srcLo, 6), _mm_set1_epi32(0x000003E0)) ); + rgbLo = _mm_or_si128(rgbLo, _mm_and_si128(_mm_srli_epi32(srcLo, 9), _mm_set1_epi32(0x00007C00)) ); + + // Convert color from high bits + rgbHi = _mm_and_si128(_mm_srli_epi32(srcHi, 3), _mm_set1_epi32(0x0000001F)); + rgbHi = _mm_or_si128(rgbHi, _mm_and_si128(_mm_srli_epi32(srcHi, 6), _mm_set1_epi32(0x000003E0)) ); + rgbHi = _mm_or_si128(rgbHi, _mm_and_si128(_mm_srli_epi32(srcHi, 9), _mm_set1_epi32(0x00007C00)) ); + } + + // Convert alpha + alpha = _mm_packs_epi32( _mm_srli_epi32(srcLo, 24), _mm_srli_epi32(srcHi, 24) ); + alpha = _mm_cmpgt_epi16(alpha, _mm_setzero_si128()); + alpha = _mm_and_si128(alpha, _mm_set1_epi16(0x8000)); + } + + return _mm_or_si128(_mm_packs_epi32(rgbLo, rgbHi), alpha); +} + +template +FORCEINLINE v128u16 ColorspaceConvert8888To5551_SSE2(const v128u32 &srcLo, const v128u32 &srcHi) +{ + return _ConvertColorBaseTo5551_SSE2(srcLo, srcHi); +} + +template +FORCEINLINE v128u16 ColorspaceConvert6665To5551_SSE2(const v128u32 &srcLo, const v128u32 &srcHi) +{ + return _ConvertColorBaseTo5551_SSE2(srcLo, srcHi); +} + +template +FORCEINLINE v128u32 ColorspaceConvert888XTo8888Opaque_SSE2(const v128u32 &src) +{ + if (SWAP_RB) + { +#ifdef ENABLE_SSSE3 + return _mm_or_si128( _mm_shuffle_epi8(src, _mm_set_epi8(15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2)), _mm_set1_epi32(0xFF000000) ); +#else + return _mm_or_si128( _mm_or_si128(_mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(0x00FF0000)), 16), _mm_or_si128(_mm_and_si128(src, _mm_set1_epi32(0x0000FF00)), _mm_slli_epi32(_mm_and_si128(src, 
_mm_set1_epi32(0x000000FF)), 16))), _mm_set1_epi32(0xFF000000) ); +#endif + } + + return _mm_or_si128(src, _mm_set1_epi32(0xFF000000)); +} + +template +FORCEINLINE v128u16 ColorspaceCopy16_SSE2(const v128u16 &src) /* Returns src unchanged, or, when SWAP_RB is set, with the 5-bit fields at bits 0-4 and bits 10-14 of every 16-bit lane exchanged while bits 5-9 and bit 15 stay in place. */ +{ + if (SWAP_RB) + { + return _mm_or_si128( _mm_or_si128(_mm_srli_epi16(_mm_and_si128(src, _mm_set1_epi16(0x7C00)), 10), _mm_or_si128(_mm_and_si128(src, _mm_set1_epi16(0x03E0)), _mm_slli_epi16(_mm_and_si128(src, _mm_set1_epi16(0x001F)), 10))), _mm_and_si128(src, _mm_set1_epi16(0x8000)) ); /* 0x03E0 selects the middle 5-bit field (bits 5-9); the previous 0x0E30 mask dropped bits 6-8 and leaked bits 10-11 into the result */ + } + + return src; +} + +template +FORCEINLINE v128u32 ColorspaceCopy32_SSE2(const v128u32 &src) +{ + if (SWAP_RB) + { +#ifdef ENABLE_SSSE3 + return _mm_shuffle_epi8(src, _mm_set_epi8(15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2)); +#else + return _mm_or_si128( _mm_or_si128(_mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(0x00FF0000)), 16), _mm_or_si128(_mm_and_si128(src, _mm_set1_epi32(0x0000FF00)), _mm_slli_epi32(_mm_and_si128(src, _mm_set1_epi32(0x000000FF)), 16))), _mm_and_si128(src, _mm_set1_epi32(0xFF000000)) ); +#endif + } + + return src; +} + +template +FORCEINLINE v128u16 ColorspaceApplyIntensity16_SSE2(const v128u16 &src, float intensity) +{ + v128u16 tempSrc = (SWAP_RB) ? 
_mm_or_si128( _mm_or_si128(_mm_srli_epi16(_mm_and_si128(src, _mm_set1_epi16(0x7C00)), 10), _mm_or_si128(_mm_and_si128(src, _mm_set1_epi16(0x03E0)), _mm_slli_epi16(_mm_and_si128(src, _mm_set1_epi16(0x001F)), 10))), _mm_and_si128(src, _mm_set1_epi16(0x8000)) ) : src; /* with SWAP_RB the fields at bits 0-4 and 10-14 trade places; 0x03E0 keeps the middle field (bits 5-9) intact, which the previous 0x0E30 mask corrupted */ + + if (intensity > 0.999f) + { + return tempSrc; + } + else if (intensity < 0.001f) + { + return _mm_and_si128(tempSrc, _mm_set1_epi16(0x8000)); /* near-zero intensity: keep only bit 15 of each lane */ + } + + v128u16 r = _mm_and_si128( tempSrc, _mm_set1_epi16(0x001F) ); + v128u16 g = _mm_and_si128( _mm_srli_epi16(tempSrc, 5), _mm_set1_epi16(0x001F) ); + v128u16 b = _mm_and_si128( _mm_srli_epi16(tempSrc, 10), _mm_set1_epi16(0x001F) ); + v128u16 a = _mm_and_si128( tempSrc, _mm_set1_epi16(0x8000) ); + + const v128u16 intensity_v128 = _mm_set1_epi16( (u16)(intensity * (float)(0xFFFF)) ); /* intensity as 0.16 fixed point, so mulhi_epu16 yields field * intensity */ + + r = _mm_mulhi_epu16(r, intensity_v128); + g = _mm_slli_epi16( _mm_mulhi_epu16(g, intensity_v128), 5 ); + b = _mm_slli_epi16( _mm_mulhi_epu16(b, intensity_v128), 10 ); + + return _mm_or_si128( _mm_or_si128( _mm_or_si128(r, g), b), a); +} + +template +FORCEINLINE v128u32 ColorspaceApplyIntensity32_SSE2(const v128u32 &src, float intensity) +{ +#ifdef ENABLE_SSSE3 + v128u32 tempSrc = (SWAP_RB) ? _mm_shuffle_epi8(src, _mm_set_epi8(15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2)) : src; +#else + v128u32 tempSrc = (SWAP_RB) ? 
_mm_or_si128( _mm_or_si128(_mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(0x00FF0000)), 16), _mm_or_si128(_mm_and_si128(src, _mm_set1_epi32(0x0000FF00)), _mm_slli_epi32(_mm_and_si128(src, _mm_set1_epi32(0x000000FF)), 16))), _mm_and_si128(src, _mm_set1_epi32(0xFF000000)) ) : src; +#endif + + if (intensity > 0.999f) + { + return tempSrc; + } + else if (intensity < 0.001f) + { + return _mm_and_si128(tempSrc, _mm_set1_epi32(0xFF000000)); + } + + v128u16 rb = _mm_and_si128( tempSrc, _mm_set1_epi32(0x00FF00FF) ); + v128u16 g = _mm_and_si128( _mm_srli_epi32(tempSrc, 8), _mm_set1_epi32(0x000000FF) ); + v128u32 a = _mm_and_si128( tempSrc, _mm_set1_epi32(0xFF000000) ); + + const v128u16 intensity_v128 = _mm_set1_epi16( (u16)(intensity * (float)(0xFFFF)) ); + + rb = _mm_mulhi_epu16(rb, intensity_v128); + g = _mm_slli_epi32( _mm_mulhi_epu16( g, intensity_v128), 8 ); + + return _mm_or_si128( _mm_or_si128(rb, g), a); +} + +template +static size_t ColorspaceConvertBuffer555To8888Opaque_SSE2(const u16 *__restrict src, u32 *__restrict dst, const size_t pixCountVec128) +{ + size_t i = 0; + + for (; i < pixCountVec128; i+=(sizeof(v128u16)/sizeof(u16))) + { + v128u16 src_vec128 = (IS_UNALIGNED) ? 
_mm_loadu_si128((v128u16 *)(src+i)) : _mm_load_si128((v128u16 *)(src+i)); + v128u32 dstConvertedLo, dstConvertedHi; + ColorspaceConvert555To8888Opaque_SSE2(src_vec128, dstConvertedLo, dstConvertedHi); + + if (IS_UNALIGNED) + { + _mm_storeu_si128((v128u32 *)(dst+i+(sizeof(v128u32)/sizeof(u32) * 0)), dstConvertedLo); + _mm_storeu_si128((v128u32 *)(dst+i+(sizeof(v128u32)/sizeof(u32) * 1)), dstConvertedHi); + } + else + { + _mm_store_si128((v128u32 *)(dst+i+(sizeof(v128u32)/sizeof(u32) * 0)), dstConvertedLo); + _mm_store_si128((v128u32 *)(dst+i+(sizeof(v128u32)/sizeof(u32) * 1)), dstConvertedHi); + } + } + + return i; +} + +template +size_t ColorspaceConvertBuffer555To6665Opaque_SSE2(const u16 *__restrict src, u32 *__restrict dst, size_t pixCountVec128) +{ + size_t i = 0; + + for (; i < pixCountVec128; i+=(sizeof(v128u16)/sizeof(u16))) + { + v128u16 src_vec128 = (IS_UNALIGNED) ? _mm_loadu_si128((v128u16 *)(src+i)) : _mm_load_si128((v128u16 *)(src+i)); + v128u32 dstConvertedLo, dstConvertedHi; + ColorspaceConvert555To6665Opaque_SSE2(src_vec128, dstConvertedLo, dstConvertedHi); + + if (IS_UNALIGNED) + { + _mm_storeu_si128((v128u32 *)(dst+i+(sizeof(v128u32)/sizeof(u32) * 0)), dstConvertedLo); + _mm_storeu_si128((v128u32 *)(dst+i+(sizeof(v128u32)/sizeof(u32) * 1)), dstConvertedHi); + } + else + { + _mm_store_si128((v128u32 *)(dst+i+(sizeof(v128u32)/sizeof(u32) * 0)), dstConvertedLo); + _mm_store_si128((v128u32 *)(dst+i+(sizeof(v128u32)/sizeof(u32) * 1)), dstConvertedHi); + } + } + + return i; +} + +template +size_t ColorspaceConvertBuffer8888To6665_SSE2(const u32 *src, u32 *dst, size_t pixCountVec128) +{ + size_t i = 0; + + for (; i < pixCountVec128; i+=(sizeof(v128u32)/sizeof(u32))) + { + if (IS_UNALIGNED) + { + _mm_storeu_si128( (v128u32 *)(dst+i), ColorspaceConvert8888To6665_SSE2(_mm_loadu_si128((v128u32 *)(src+i))) ); + } + else + { + _mm_store_si128( (v128u32 *)(dst+i), ColorspaceConvert8888To6665_SSE2(_mm_load_si128((v128u32 *)(src+i))) ); + } + } + + return i; +} + 
+template +size_t ColorspaceConvertBuffer6665To8888_SSE2(const u32 *src, u32 *dst, size_t pixCountVec128) +{ + size_t i = 0; + + for (; i < pixCountVec128; i+=(sizeof(v128u32)/sizeof(u32))) + { + if (IS_UNALIGNED) + { + _mm_storeu_si128( (v128u32 *)(dst+i), ColorspaceConvert6665To8888_SSE2(_mm_loadu_si128((v128u32 *)(src+i))) ); + } + else + { + _mm_store_si128( (v128u32 *)(dst+i), ColorspaceConvert6665To8888_SSE2(_mm_load_si128((v128u32 *)(src+i))) ); + } + } + + return i; +} + +template +size_t ColorspaceConvertBuffer8888To5551_SSE2(const u32 *__restrict src, u16 *__restrict dst, size_t pixCountVec128) +{ + size_t i = 0; + + for (; i < pixCountVec128; i+=(sizeof(v128u16)/sizeof(u16))) + { + if (IS_UNALIGNED) + { + _mm_storeu_si128( (v128u16 *)(dst+i), ColorspaceConvert8888To5551_SSE2(_mm_loadu_si128((v128u32 *)(src+i)), _mm_loadu_si128((v128u32 *)(src+i+4))) ); + } + else + { + _mm_store_si128( (v128u16 *)(dst+i), ColorspaceConvert8888To5551_SSE2(_mm_load_si128((v128u32 *)(src+i)), _mm_load_si128((v128u32 *)(src+i+4))) ); + } + } + + return i; +} + +template +size_t ColorspaceConvertBuffer6665To5551_SSE2(const u32 *__restrict src, u16 *__restrict dst, size_t pixCountVec128) +{ + size_t i = 0; + + for (; i < pixCountVec128; i+=(sizeof(v128u16)/sizeof(u16))) + { + if (IS_UNALIGNED) + { + _mm_storeu_si128( (v128u16 *)(dst+i), ColorspaceConvert6665To5551_SSE2(_mm_loadu_si128((v128u32 *)(src+i)), _mm_loadu_si128((v128u32 *)(src+i+4))) ); + } + else + { + _mm_store_si128( (v128u16 *)(dst+i), ColorspaceConvert6665To5551_SSE2(_mm_load_si128((v128u32 *)(src+i)), _mm_load_si128((v128u32 *)(src+i+4))) ); + } + } + + return i; +} + +template +size_t ColorspaceConvertBuffer888XTo8888Opaque_SSE2(const u32 *src, u32 *dst, size_t pixCountVec128) +{ + size_t i = 0; + + for (; i < pixCountVec128; i+=(sizeof(v128u32)/sizeof(u32))) + { + if (IS_UNALIGNED) + { + _mm_storeu_si128( (v128u32 *)(dst+i), ColorspaceConvert888XTo8888Opaque_SSE2(_mm_loadu_si128((v128u32 *)(src+i))) ); + } + 
else + { + _mm_store_si128( (v128u32 *)(dst+i), ColorspaceConvert888XTo8888Opaque_SSE2(_mm_load_si128((v128u32 *)(src+i))) ); + } + } + + return i; +} + +#ifdef ENABLE_SSSE3 + +template +size_t ColorspaceConvertBuffer555XTo888_SSSE3(const u16 *__restrict src, u8 *__restrict dst, size_t pixCountVec128) +{ + size_t i = 0; + v128u16 src_v128u16[2]; + v128u32 src_v128u32[4]; + + for (; i < pixCountVec128; i+=((sizeof(v128u16)/sizeof(u16)) * 2)) + { + if (IS_UNALIGNED) + { + src_v128u16[0] = _mm_loadu_si128( (v128u16 *)(src + i + ((sizeof(v128u16)/sizeof(u16)) * 0)) ); + src_v128u16[1] = _mm_loadu_si128( (v128u16 *)(src + i + ((sizeof(v128u16)/sizeof(u16)) * 1)) ); + } + else + { + src_v128u16[0] = _mm_load_si128( (v128u16 *)(src + i + ((sizeof(v128u16)/sizeof(u16)) * 0)) ); + src_v128u16[1] = _mm_load_si128( (v128u16 *)(src + i + ((sizeof(v128u16)/sizeof(u16)) * 1)) ); + } + + v128u16 rb = _mm_and_si128( _mm_or_si128(_mm_slli_epi16(src_v128u16[0], 11), _mm_srli_epi16(src_v128u16[0], 7)), _mm_set1_epi16(0xF8F8) ); + v128u16 g = _mm_and_si128( _mm_srli_epi16(src_v128u16[0], 2), _mm_set1_epi16(0x00F8) ); + src_v128u32[0] = _mm_unpacklo_epi16(rb, g); + src_v128u32[1] = _mm_unpackhi_epi16(rb, g); + + rb = _mm_and_si128( _mm_or_si128(_mm_slli_epi16(src_v128u16[1], 11), _mm_srli_epi16(src_v128u16[1], 7)), _mm_set1_epi16(0xF8F8) ); + g = _mm_and_si128( _mm_srli_epi16(src_v128u16[1], 2), _mm_set1_epi16(0x00F8) ); + src_v128u32[2] = _mm_unpacklo_epi16(rb, g); + src_v128u32[3] = _mm_unpackhi_epi16(rb, g); + + src_v128u32[0] = _mm_or_si128( src_v128u32[0], _mm_and_si128(_mm_srli_epi32(src_v128u32[0], 5), _mm_set1_epi32(0x00070707)) ); + src_v128u32[1] = _mm_or_si128( src_v128u32[1], _mm_and_si128(_mm_srli_epi32(src_v128u32[1], 5), _mm_set1_epi32(0x00070707)) ); + src_v128u32[2] = _mm_or_si128( src_v128u32[2], _mm_and_si128(_mm_srli_epi32(src_v128u32[2], 5), _mm_set1_epi32(0x00070707)) ); + src_v128u32[3] = _mm_or_si128( src_v128u32[3], _mm_and_si128(_mm_srli_epi32(src_v128u32[3], 
5), _mm_set1_epi32(0x00070707)) ); + + if (SWAP_RB) + { + src_v128u32[0] = _mm_shuffle_epi8( src_v128u32[0], _mm_set_epi8(15,11, 7, 3, 13,14,12, 9, 10, 8, 5, 6, 4, 1, 2, 0) ); + src_v128u32[1] = _mm_shuffle_epi8( src_v128u32[1], _mm_set_epi8( 4, 1, 2, 0, 15,11, 7, 3, 13,14,12, 9, 10, 8, 5, 6) ); + src_v128u32[2] = _mm_shuffle_epi8( src_v128u32[2], _mm_set_epi8(10, 8, 5, 6, 4, 1, 2, 0, 15,11, 7, 3, 13,14,12, 9) ); + src_v128u32[3] = _mm_shuffle_epi8( src_v128u32[3], _mm_set_epi8(13,14,12, 9, 10, 8, 5, 6, 4, 1, 2, 0, 15,11, 7, 3) ); + } + else + { + src_v128u32[0] = _mm_shuffle_epi8( src_v128u32[0], _mm_set_epi8(15,11, 7, 3, 12,14,13, 8, 10, 9, 4, 6, 5, 0, 2, 1) ); + src_v128u32[1] = _mm_shuffle_epi8( src_v128u32[1], _mm_set_epi8( 5, 0, 2, 1, 15,11, 7, 3, 12,14,13, 8, 10, 9, 4, 6) ); + src_v128u32[2] = _mm_shuffle_epi8( src_v128u32[2], _mm_set_epi8(10, 9, 4, 6, 5, 0, 2, 1, 15,11, 7, 3, 12,14,13, 8) ); + src_v128u32[3] = _mm_shuffle_epi8( src_v128u32[3], _mm_set_epi8(12,14,13, 8, 10, 9, 4, 6, 5, 0, 2, 1, 15,11, 7, 3) ); + } + +#ifdef ENABLE_SSE4_1 + if (IS_UNALIGNED) + { + _mm_storeu_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 0)), _mm_blend_epi16(src_v128u32[0], src_v128u32[1], 0xC0) ); + _mm_storeu_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 1)), _mm_blend_epi16(src_v128u32[1], src_v128u32[2], 0xF0) ); + _mm_storeu_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 2)), _mm_blend_epi16(src_v128u32[2], src_v128u32[3], 0xFC) ); + } + else + { + _mm_store_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 0)), _mm_blend_epi16(src_v128u32[0], src_v128u32[1], 0xC0) ); + _mm_store_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 1)), _mm_blend_epi16(src_v128u32[1], src_v128u32[2], 0xF0) ); + _mm_store_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 2)), _mm_blend_epi16(src_v128u32[2], src_v128u32[3], 0xFC) ); + } +#else + if (IS_UNALIGNED) + { + _mm_storeu_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 0)), 
_mm_or_si128(_mm_and_si128(src_v128u32[1], _mm_set_epi32(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000)), src_v128u32[0]) ); + _mm_storeu_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 1)), _mm_or_si128(_mm_and_si128(src_v128u32[2], _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000)), _mm_and_si128(src_v128u32[1], _mm_set_epi32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF))) ); + _mm_storeu_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 2)), _mm_or_si128( src_v128u32[3], _mm_and_si128(src_v128u32[2], _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF))) ); + } + else + { + _mm_store_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 0)), _mm_or_si128(_mm_and_si128(src_v128u32[1], _mm_set_epi32(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000)), src_v128u32[0]) ); + _mm_store_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 1)), _mm_or_si128(_mm_and_si128(src_v128u32[2], _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000)), _mm_and_si128(src_v128u32[1], _mm_set_epi32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF))) ); + _mm_store_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 2)), _mm_or_si128( src_v128u32[3], _mm_and_si128(src_v128u32[2], _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF))) ); + } +#endif + } + + return i; +} + +template +size_t ColorspaceConvertBuffer888XTo888_SSSE3(const u32 *__restrict src, u8 *__restrict dst, size_t pixCountVec128) +{ + size_t i = 0; + v128u32 src_v128u32[4]; + + for (; i < pixCountVec128; i+=((sizeof(v128u32)/sizeof(u32)) * 4)) + { + if (IS_UNALIGNED) + { + src_v128u32[0] = _mm_loadu_si128( (v128u32 *)(src + i + ((sizeof(v128u32)/sizeof(u32)) * 0)) ); + src_v128u32[1] = _mm_loadu_si128( (v128u32 *)(src + i + ((sizeof(v128u32)/sizeof(u32)) * 1)) ); + src_v128u32[2] = _mm_loadu_si128( (v128u32 *)(src + i + ((sizeof(v128u32)/sizeof(u32)) * 2)) ); + src_v128u32[3] = _mm_loadu_si128( (v128u32 *)(src + i + ((sizeof(v128u32)/sizeof(u32)) * 3)) ); + } + else + { + 
src_v128u32[0] = _mm_load_si128( (v128u32 *)(src + i + ((sizeof(v128u32)/sizeof(u32)) * 0)) ); + src_v128u32[1] = _mm_load_si128( (v128u32 *)(src + i + ((sizeof(v128u32)/sizeof(u32)) * 1)) ); + src_v128u32[2] = _mm_load_si128( (v128u32 *)(src + i + ((sizeof(v128u32)/sizeof(u32)) * 2)) ); + src_v128u32[3] = _mm_load_si128( (v128u32 *)(src + i + ((sizeof(v128u32)/sizeof(u32)) * 3)) ); + } + + if (SWAP_RB) + { + src_v128u32[0] = _mm_shuffle_epi8(src_v128u32[0], _mm_set_epi8(15,11, 7, 3, 12,13,14, 8, 9,10, 4, 5, 6, 0, 1, 2)); + src_v128u32[1] = _mm_shuffle_epi8(src_v128u32[1], _mm_set_epi8( 6, 0, 1, 2, 15,11, 7, 3, 12,13,14, 8, 9,10, 4, 5)); + src_v128u32[2] = _mm_shuffle_epi8(src_v128u32[2], _mm_set_epi8( 9,10, 4, 5, 6, 0, 1, 2, 15,11, 7, 3, 12,13,14, 8)); + src_v128u32[3] = _mm_shuffle_epi8(src_v128u32[3], _mm_set_epi8(12,13,14, 8, 9,10, 4, 5, 6, 0, 1, 2, 15,11, 7, 3)); + } + else + { + src_v128u32[0] = _mm_shuffle_epi8(src_v128u32[0], _mm_set_epi8(15,11, 7, 3, 14,13,12,10, 9, 8, 6, 5, 4, 2, 1, 0)); + src_v128u32[1] = _mm_shuffle_epi8(src_v128u32[1], _mm_set_epi8( 4, 2, 1, 0, 15,11, 7, 3, 14,13,12,10, 9, 8, 6, 5)); + src_v128u32[2] = _mm_shuffle_epi8(src_v128u32[2], _mm_set_epi8( 9, 8, 6, 5, 4, 2, 1, 0, 15,11, 7, 3, 14,13,12,10)); + src_v128u32[3] = _mm_shuffle_epi8(src_v128u32[3], _mm_set_epi8(14,13,12,10, 9, 8, 6, 5, 4, 2, 1, 0, 15,11, 7, 3)); + } + +#ifdef ENABLE_SSE4_1 + if (IS_UNALIGNED) + { + _mm_storeu_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 0)), _mm_blend_epi16(src_v128u32[0], src_v128u32[1], 0xC0) ); + _mm_storeu_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 1)), _mm_blend_epi16(src_v128u32[1], src_v128u32[2], 0xF0) ); + _mm_storeu_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 2)), _mm_blend_epi16(src_v128u32[2], src_v128u32[3], 0xFC) ); + } + else + { + _mm_store_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 0)), _mm_blend_epi16(src_v128u32[0], src_v128u32[1], 0xC0) ); + _mm_store_si128( (v128u8 *)(dst + (i * 3) + 
(sizeof(v128u32) * 1)), _mm_blend_epi16(src_v128u32[1], src_v128u32[2], 0xF0) ); + _mm_store_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 2)), _mm_blend_epi16(src_v128u32[2], src_v128u32[3], 0xFC) ); + } +#else + if (IS_UNALIGNED) + { + _mm_storeu_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 0)), _mm_or_si128(_mm_and_si128(src_v128u32[1], _mm_set_epi32(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000)), _mm_and_si128(src_v128u32[0], _mm_set_epi32(0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF))) ); + _mm_storeu_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 1)), _mm_or_si128(_mm_and_si128(src_v128u32[2], _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000)), _mm_and_si128(src_v128u32[1], _mm_set_epi32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF))) ); + _mm_storeu_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 2)), _mm_or_si128(_mm_and_si128(src_v128u32[3], _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000)), _mm_and_si128(src_v128u32[2], _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF))) ); + } + else + { + _mm_store_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 0)), _mm_or_si128(_mm_and_si128(src_v128u32[1], _mm_set_epi32(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000)), _mm_and_si128(src_v128u32[0], _mm_set_epi32(0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF))) ); + _mm_store_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 1)), _mm_or_si128(_mm_and_si128(src_v128u32[2], _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000)), _mm_and_si128(src_v128u32[1], _mm_set_epi32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF))) ); + _mm_store_si128( (v128u8 *)(dst + (i * 3) + (sizeof(v128u32) * 2)), _mm_or_si128(_mm_and_si128(src_v128u32[3], _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000)), _mm_and_si128(src_v128u32[2], _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF))) ); + } +#endif + } + + return i; +} + +#endif + +template +size_t 
ColorspaceCopyBuffer16_SSE2(const u16 *src, u16 *dst, size_t pixCountVec128) +{ + if (!SWAP_RB) + { + memcpy(dst, src, pixCountVec128 * sizeof(u16)); + return pixCountVec128; + } + + size_t i = 0; + + for (; i < pixCountVec128; i+=(sizeof(v128u16)/sizeof(u16))) + { + v128u16 src_vec128 = (IS_UNALIGNED) ? _mm_loadu_si128((v128u16 *)(src+i)) : _mm_load_si128((v128u16 *)(src+i)); + + if (IS_UNALIGNED) + { + _mm_storeu_si128((v128u16 *)(dst+i), ColorspaceCopy16_SSE2(src_vec128)); + } + else + { + _mm_store_si128((v128u16 *)(dst+i), ColorspaceCopy16_SSE2(src_vec128)); + } + } + + return i; +} + +template +size_t ColorspaceCopyBuffer32_SSE2(const u32 *src, u32 *dst, size_t pixCountVec128) +{ + if (!SWAP_RB) + { + memcpy(dst, src, pixCountVec128 * sizeof(u32)); + return pixCountVec128; + } + + size_t i = 0; + + for (; i < pixCountVec128; i+=(sizeof(v128u32)/sizeof(u32))) + { + v128u32 src_vec128 = (IS_UNALIGNED) ? _mm_loadu_si128((v128u32 *)(src+i)) : _mm_load_si128((v128u32 *)(src+i)); + + if (IS_UNALIGNED) + { + _mm_storeu_si128((v128u32 *)(dst+i), ColorspaceCopy32_SSE2(src_vec128)); + } + else + { + _mm_store_si128((v128u32 *)(dst+i), ColorspaceCopy32_SSE2(src_vec128)); + } + } + + return i; +} + +template +size_t ColorspaceApplyIntensityToBuffer16_SSE2(u16 *dst, size_t pixCountVec128, float intensity) +{ + size_t i = 0; + + if (intensity > 0.999f) + { + if (SWAP_RB) + { + for (; i < pixCountVec128; i+=(sizeof(v128u16)/sizeof(u16))) + { + const v128u16 dst_v128 = (IS_UNALIGNED) ? 
_mm_loadu_si128((v128u16 *)(dst+i)) : _mm_load_si128((v128u16 *)(dst+i)); + const v128u16 tempDst = _mm_or_si128( _mm_or_si128(_mm_srli_epi16(_mm_and_si128(dst_v128, _mm_set1_epi16(0x7C00)), 10), _mm_or_si128(_mm_and_si128(dst_v128, _mm_set1_epi16(0x03E0)), _mm_slli_epi16(_mm_and_si128(dst_v128, _mm_set1_epi16(0x001F)), 10))), _mm_and_si128(dst_v128, _mm_set1_epi16(0x8000)) ); /* full intensity with SWAP_RB: only exchange the bit 0-4 and bit 10-14 fields; 0x03E0 preserves the middle field (bits 5-9), which the previous 0x0E30 mask corrupted */ + + if (IS_UNALIGNED) + { + _mm_storeu_si128( (v128u16 *)(dst+i), tempDst); + } + else + { + _mm_store_si128( (v128u16 *)(dst+i), tempDst); + } + } + } + else + { + return pixCountVec128; + } + } + else if (intensity < 0.001f) + { + for (; i < pixCountVec128; i+=(sizeof(v128u16)/sizeof(u16))) + { + if (IS_UNALIGNED) + { + _mm_storeu_si128( (v128u16 *)(dst+i), _mm_and_si128(_mm_loadu_si128((v128u16 *)(dst+i)), _mm_set1_epi16(0x8000)) ); + } + else + { + _mm_store_si128( (v128u16 *)(dst+i), _mm_and_si128(_mm_load_si128((v128u16 *)(dst+i)), _mm_set1_epi16(0x8000)) ); + } + } + } + else + { + const v128u16 intensity_v128 = _mm_set1_epi16( (u16)(intensity * (float)(0xFFFF)) ); + + for (; i < pixCountVec128; i+=(sizeof(v128u16)/sizeof(u16))) + { + v128u16 dst_v128 = (IS_UNALIGNED) ? _mm_loadu_si128((v128u16 *)(dst+i)) : _mm_load_si128((v128u16 *)(dst+i)); + v128u16 tempDst = (SWAP_RB) ? 
_mm_or_si128( _mm_or_si128(_mm_srli_epi16(_mm_and_si128(dst_v128, _mm_set1_epi16(0x7C00)), 10), _mm_or_si128(_mm_and_si128(dst_v128, _mm_set1_epi16(0x03E0)), _mm_slli_epi16(_mm_and_si128(dst_v128, _mm_set1_epi16(0x001F)), 10))), _mm_and_si128(dst_v128, _mm_set1_epi16(0x8000)) ) : dst_v128; /* same field swap as the full-intensity branch; the middle-field mask must be 0x03E0 (bits 5-9) */ + + v128u16 r = _mm_and_si128( tempDst, _mm_set1_epi16(0x001F) ); + v128u16 g = _mm_and_si128( _mm_srli_epi16(tempDst, 5), _mm_set1_epi16(0x001F) ); + v128u16 b = _mm_and_si128( _mm_srli_epi16(tempDst, 10), _mm_set1_epi16(0x001F) ); + v128u16 a = _mm_and_si128( tempDst, _mm_set1_epi16(0x8000) ); + + r = _mm_mulhi_epu16(r, intensity_v128); + g = _mm_slli_epi16( _mm_mulhi_epu16(g, intensity_v128), 5 ); + b = _mm_slli_epi16( _mm_mulhi_epu16(b, intensity_v128), 10 ); + + tempDst = _mm_or_si128( _mm_or_si128( _mm_or_si128(r, g), b), a); + + if (IS_UNALIGNED) + { + _mm_storeu_si128((v128u16 *)(dst+i), tempDst); + } + else + { + _mm_store_si128((v128u16 *)(dst+i), tempDst); + } + } + } + + return i; +} + +template +size_t ColorspaceApplyIntensityToBuffer32_SSE2(u32 *dst, size_t pixCountVec128, float intensity) +{ + size_t i = 0; + + if (intensity > 0.999f) + { + if (SWAP_RB) + { + for (; i < pixCountVec128; i+=(sizeof(v128u32)/sizeof(u32))) + { + const v128u32 dst_v128 = (IS_UNALIGNED) ? 
_mm_loadu_si128((v128u32 *)(dst+i)) : _mm_load_si128((v128u32 *)(dst+i)); +#ifdef ENABLE_SSSE3 + const v128u32 tempDst = _mm_shuffle_epi8(dst_v128, _mm_set_epi8(15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2)); +#else + const v128u32 tempDst = _mm_or_si128( _mm_or_si128(_mm_srli_epi32(_mm_and_si128(dst_v128, _mm_set1_epi32(0x00FF0000)), 16), _mm_or_si128(_mm_and_si128(dst_v128, _mm_set1_epi32(0x0000FF00)), _mm_slli_epi32(_mm_and_si128(dst_v128, _mm_set1_epi32(0x000000FF)), 16))), _mm_and_si128(dst_v128, _mm_set1_epi32(0xFF000000)) ); +#endif + if (IS_UNALIGNED) + { + _mm_storeu_si128( (v128u32 *)(dst+i), tempDst); + } + else + { + _mm_store_si128( (v128u32 *)(dst+i), tempDst); + } + } + } + else + { + return pixCountVec128; + } + } + else if (intensity < 0.001f) + { + for (; i < pixCountVec128; i+=(sizeof(v128u32)/sizeof(u32))) + { + if (IS_UNALIGNED) + { + _mm_storeu_si128( (v128u32 *)(dst+i), _mm_and_si128(_mm_loadu_si128((v128u32 *)(dst+i)), _mm_set1_epi32(0xFF000000)) ); + } + else + { + _mm_store_si128( (v128u32 *)(dst+i), _mm_and_si128(_mm_load_si128((v128u32 *)(dst+i)), _mm_set1_epi32(0xFF000000)) ); + } + } + } + else + { + const v128u16 intensity_v128 = _mm_set1_epi16( (u16)(intensity * (float)(0xFFFF)) ); + + for (; i < pixCountVec128; i+=(sizeof(v128u32)/sizeof(u32))) + { + v128u32 dst_v128 = (IS_UNALIGNED) ? _mm_loadu_si128((v128u32 *)(dst+i)) : _mm_load_si128((v128u32 *)(dst+i)); +#ifdef ENABLE_SSSE3 + v128u32 tempDst = (SWAP_RB) ? _mm_shuffle_epi8(dst_v128, _mm_set_epi8(15,12,13,14, 11,8,9,10, 7,4,5,6, 3,0,1,2)) : dst_v128; +#else + v128u32 tempDst = (SWAP_RB) ? 
_mm_or_si128( _mm_or_si128(_mm_srli_epi32(_mm_and_si128(dst_v128, _mm_set1_epi32(0x00FF0000)), 16), _mm_or_si128(_mm_and_si128(dst_v128, _mm_set1_epi32(0x0000FF00)), _mm_slli_epi32(_mm_and_si128(dst_v128, _mm_set1_epi32(0x000000FF)), 16))), _mm_and_si128(dst_v128, _mm_set1_epi32(0xFF000000)) ) : dst_v128; +#endif + + v128u16 rb = _mm_and_si128( tempDst, _mm_set1_epi32(0x00FF00FF) ); + v128u16 g = _mm_and_si128( _mm_srli_epi32(tempDst, 8), _mm_set1_epi32(0x000000FF) ); + v128u32 a = _mm_and_si128( tempDst, _mm_set1_epi32(0xFF000000) ); + + rb = _mm_mulhi_epu16(rb, intensity_v128); + g = _mm_slli_epi32( _mm_mulhi_epu16( g, intensity_v128), 8 ); + + tempDst = _mm_or_si128( _mm_or_si128(rb, g), a); + + if (IS_UNALIGNED) + { + _mm_storeu_si128((v128u32 *)(dst+i), tempDst); + } + else + { + _mm_store_si128((v128u32 *)(dst+i), tempDst); + } + } + } + + return i; +} + +template +size_t ColorspaceHandler_SSE2::ConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To8888Opaque_SSE2(src, dst, pixCount); +} + +template +size_t ColorspaceHandler_SSE2::ConvertBuffer555To8888Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To8888Opaque_SSE2(src, dst, pixCount); +} + +template +size_t ColorspaceHandler_SSE2::ConvertBuffer555To8888Opaque_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To8888Opaque_SSE2(src, dst, pixCount); +} + +template +size_t ColorspaceHandler_SSE2::ConvertBuffer555To8888Opaque_SwapRB_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To8888Opaque_SSE2(src, dst, pixCount); +} + +template +size_t ColorspaceHandler_SSE2::ConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return 
ColorspaceConvertBuffer555To6665Opaque_SSE2(src, dst, pixCount); +} + +template +size_t ColorspaceHandler_SSE2::ConvertBuffer555To6665Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To6665Opaque_SSE2(src, dst, pixCount); +} + +template +size_t ColorspaceHandler_SSE2::ConvertBuffer555To6665Opaque_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To6665Opaque_SSE2(src, dst, pixCount); +} + +template +size_t ColorspaceHandler_SSE2::ConvertBuffer555To6665Opaque_SwapRB_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555To6665Opaque_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To6665_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer8888To6665_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To6665_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer8888To6665_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To6665_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer8888To6665_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To6665_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To8888_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer6665To8888_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To8888_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer6665To8888_IsUnaligned(const u32 
*src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To8888_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer6665To8888_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To8888_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To5551_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer8888To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To5551_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer8888To5551_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To5551_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer8888To5551_SwapRB_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer8888To5551_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To5551_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer6665To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To5551_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer6665To5551_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer6665To5551_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer6665To5551_SwapRB_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const +{ + return 
ColorspaceConvertBuffer6665To5551_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo8888Opaque_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer888XTo8888Opaque_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo8888Opaque_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer888XTo8888Opaque_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo8888Opaque_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer888XTo8888Opaque_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo8888Opaque_SSE2(src, dst, pixCount); +} + +#ifdef ENABLE_SSSE3 + +size_t ColorspaceHandler_SSE2::ConvertBuffer555XTo888(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555XTo888_SSSE3(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer555XTo888_SwapRB(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555XTo888_SSSE3(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer555XTo888_IsUnaligned(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555XTo888_SSSE3(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer555XTo888_SwapRB_IsUnaligned(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer555XTo888_SSSE3(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer888XTo888(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo888_SSSE3(src, dst, pixCount); +} + +size_t 
ColorspaceHandler_SSE2::ConvertBuffer888XTo888_SwapRB(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo888_SSSE3(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer888XTo888_IsUnaligned(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo888_SSSE3(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ConvertBuffer888XTo888_SwapRB_IsUnaligned(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const +{ + return ColorspaceConvertBuffer888XTo888_SSSE3(src, dst, pixCount); +} + +#endif + +size_t ColorspaceHandler_SSE2::CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const +{ + return ColorspaceCopyBuffer16_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::CopyBuffer16_SwapRB_IsUnaligned(const u16 *src, u16 *dst, size_t pixCount) const +{ + return ColorspaceCopyBuffer16_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::CopyBuffer32_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceCopyBuffer32_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::CopyBuffer32_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const +{ + return ColorspaceCopyBuffer32_SSE2(src, dst, pixCount); +} + +size_t ColorspaceHandler_SSE2::ApplyIntensityToBuffer16(u16 *dst, size_t pixCount, float intensity) const +{ + return ColorspaceApplyIntensityToBuffer16_SSE2(dst, pixCount, intensity); +} + +size_t ColorspaceHandler_SSE2::ApplyIntensityToBuffer16_SwapRB(u16 *dst, size_t pixCount, float intensity) const +{ + return ColorspaceApplyIntensityToBuffer16_SSE2(dst, pixCount, intensity); +} + +size_t ColorspaceHandler_SSE2::ApplyIntensityToBuffer16_IsUnaligned(u16 *dst, size_t pixCount, float intensity) const +{ + return ColorspaceApplyIntensityToBuffer16_SSE2(dst, pixCount, intensity); +} + +size_t 
ColorspaceHandler_SSE2::ApplyIntensityToBuffer16_SwapRB_IsUnaligned(u16 *dst, size_t pixCount, float intensity) const +{ + return ColorspaceApplyIntensityToBuffer16_SSE2(dst, pixCount, intensity); +} + +size_t ColorspaceHandler_SSE2::ApplyIntensityToBuffer32(u32 *dst, size_t pixCount, float intensity) const +{ + return ColorspaceApplyIntensityToBuffer32_SSE2(dst, pixCount, intensity); +} + +size_t ColorspaceHandler_SSE2::ApplyIntensityToBuffer32_SwapRB(u32 *dst, size_t pixCount, float intensity) const +{ + return ColorspaceApplyIntensityToBuffer32_SSE2(dst, pixCount, intensity); +} + +size_t ColorspaceHandler_SSE2::ApplyIntensityToBuffer32_IsUnaligned(u32 *dst, size_t pixCount, float intensity) const +{ + return ColorspaceApplyIntensityToBuffer32_SSE2(dst, pixCount, intensity); +} + +size_t ColorspaceHandler_SSE2::ApplyIntensityToBuffer32_SwapRB_IsUnaligned(u32 *dst, size_t pixCount, float intensity) const +{ + return ColorspaceApplyIntensityToBuffer32_SSE2(dst, pixCount, intensity); +} + +template void ColorspaceConvert555To8888_SSE2(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To8888_SSE2(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi); + +template void ColorspaceConvert555XTo888X_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555XTo888X_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); + +template void ColorspaceConvert555To6665_SSE2(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To6665_SSE2(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi); + +template void ColorspaceConvert555XTo666X_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555XTo666X_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); + +template void 
ColorspaceConvert555To8888Opaque_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To8888Opaque_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); + +template void ColorspaceConvert555To6665Opaque_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To6665Opaque_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); + +template v128u32 ColorspaceConvert8888To6665_SSE2(const v128u32 &src); +template v128u32 ColorspaceConvert8888To6665_SSE2(const v128u32 &src); + +template v128u32 ColorspaceConvert6665To8888_SSE2(const v128u32 &src); +template v128u32 ColorspaceConvert6665To8888_SSE2(const v128u32 &src); + +template v128u16 ColorspaceConvert8888To5551_SSE2(const v128u32 &srcLo, const v128u32 &srcHi); +template v128u16 ColorspaceConvert8888To5551_SSE2(const v128u32 &srcLo, const v128u32 &srcHi); + +template v128u16 ColorspaceConvert6665To5551_SSE2(const v128u32 &srcLo, const v128u32 &srcHi); +template v128u16 ColorspaceConvert6665To5551_SSE2(const v128u32 &srcLo, const v128u32 &srcHi); + +template v128u32 ColorspaceConvert888XTo8888Opaque_SSE2(const v128u32 &src); +template v128u32 ColorspaceConvert888XTo8888Opaque_SSE2(const v128u32 &src); + +template v128u16 ColorspaceCopy16_SSE2(const v128u16 &src); +template v128u16 ColorspaceCopy16_SSE2(const v128u16 &src); + +template v128u32 ColorspaceCopy32_SSE2(const v128u32 &src); +template v128u32 ColorspaceCopy32_SSE2(const v128u32 &src); + +template v128u16 ColorspaceApplyIntensity16_SSE2(const v128u16 &src, float intensity); +template v128u16 ColorspaceApplyIntensity16_SSE2(const v128u16 &src, float intensity); + +template v128u32 ColorspaceApplyIntensity32_SSE2(const v128u32 &src, float intensity); +template v128u32 ColorspaceApplyIntensity32_SSE2(const v128u32 &src, float intensity); + +#endif // ENABLE_SSE2 diff --git a/FSCHMasterEditor/nds/colorspacehandler/colorspacehandler_SSE2.h 
b/FSCHMasterEditor/nds/colorspacehandler/colorspacehandler_SSE2.h new file mode 100644 index 0000000..094dc51 --- /dev/null +++ b/FSCHMasterEditor/nds/colorspacehandler/colorspacehandler_SSE2.h @@ -0,0 +1,116 @@ +/* + Copyright (C) 2016-2021 DeSmuME team + + This file is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This file is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with the this software. If not, see . + */ + +#ifndef COLORSPACEHANDLER_SSE2_H +#define COLORSPACEHANDLER_SSE2_H + +#include "colorspacehandler.h" + +#ifndef ENABLE_SSE2 + #warning This header requires SSE2 support. 
+#else + +template void ColorspaceConvert555To8888_SSE2(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555XTo888X_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To6665_SSE2(const v128u16 &srcColor, const v128u16 &srcAlphaBits, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555XTo666X_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To8888Opaque_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template void ColorspaceConvert555To6665Opaque_SSE2(const v128u16 &srcColor, v128u32 &dstLo, v128u32 &dstHi); +template v128u32 ColorspaceConvert8888To6665_SSE2(const v128u32 &src); +template v128u32 ColorspaceConvert6665To8888_SSE2(const v128u32 &src); +template v128u16 ColorspaceConvert8888To5551_SSE2(const v128u32 &srcLo, const v128u32 &srcHi); +template v128u16 ColorspaceConvert6665To5551_SSE2(const v128u32 &srcLo, const v128u32 &srcHi); +template v128u32 ColorspaceConvert888XTo8888Opaque_SSE2(const v128u32 &src); + +template v128u16 ColorspaceCopy16_SSE2(const v128u16 &src); +template v128u32 ColorspaceCopy32_SSE2(const v128u32 &src); + +template v128u16 ColorspaceApplyIntensity16_SSE2(const v128u16 &src, float intensity); +template v128u32 ColorspaceApplyIntensity32_SSE2(const v128u32 &src, float intensity); + +class ColorspaceHandler_SSE2 : public ColorspaceHandler +{ +public: + ColorspaceHandler_SSE2() {}; + + template size_t ConvertBuffer555To8888Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + template size_t ConvertBuffer555To8888Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + template size_t ConvertBuffer555To8888Opaque_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + template size_t ConvertBuffer555To8888Opaque_SwapRB_IsUnaligned(const u16 *__restrict src, 
u32 *__restrict dst, size_t pixCount) const; + + template size_t ConvertBuffer555To6665Opaque(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + template size_t ConvertBuffer555To6665Opaque_SwapRB(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + template size_t ConvertBuffer555To6665Opaque_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + template size_t ConvertBuffer555To6665Opaque_SwapRB_IsUnaligned(const u16 *__restrict src, u32 *__restrict dst, size_t pixCount) const; + + size_t ConvertBuffer8888To6665(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer8888To6665_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer8888To6665_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer8888To6665_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + + size_t ConvertBuffer6665To8888(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer6665To8888_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer6665To8888_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer6665To8888_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + + size_t ConvertBuffer8888To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer8888To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer8888To5551_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer8888To5551_SwapRB_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + + size_t ConvertBuffer6665To5551(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer6665To5551_SwapRB(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t 
ConvertBuffer6665To5551_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer6665To5551_SwapRB_IsUnaligned(const u32 *__restrict src, u16 *__restrict dst, size_t pixCount) const; + + size_t ConvertBuffer888XTo8888Opaque(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer888XTo8888Opaque_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer888XTo8888Opaque_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + size_t ConvertBuffer888XTo8888Opaque_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + +#ifdef ENABLE_SSSE3 + size_t ConvertBuffer555XTo888(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555XTo888_SwapRB(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555XTo888_IsUnaligned(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer555XTo888_SwapRB_IsUnaligned(const u16 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + + size_t ConvertBuffer888XTo888(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer888XTo888_SwapRB(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer888XTo888_IsUnaligned(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const; + size_t ConvertBuffer888XTo888_SwapRB_IsUnaligned(const u32 *__restrict src, u8 *__restrict dst, size_t pixCount) const; +#endif + + size_t CopyBuffer16_SwapRB(const u16 *src, u16 *dst, size_t pixCount) const; + size_t CopyBuffer16_SwapRB_IsUnaligned(const u16 *src, u16 *dst, size_t pixCount) const; + + size_t CopyBuffer32_SwapRB(const u32 *src, u32 *dst, size_t pixCount) const; + size_t CopyBuffer32_SwapRB_IsUnaligned(const u32 *src, u32 *dst, size_t pixCount) const; + + size_t ApplyIntensityToBuffer16(u16 *dst, size_t pixCount, float intensity) const; + 
size_t ApplyIntensityToBuffer16_SwapRB(u16 *dst, size_t pixCount, float intensity) const; + size_t ApplyIntensityToBuffer16_IsUnaligned(u16 *dst, size_t pixCount, float intensity) const; + size_t ApplyIntensityToBuffer16_SwapRB_IsUnaligned(u16 *dst, size_t pixCount, float intensity) const; + + size_t ApplyIntensityToBuffer32(u32 *dst, size_t pixCount, float intensity) const; + size_t ApplyIntensityToBuffer32_SwapRB(u32 *dst, size_t pixCount, float intensity) const; + size_t ApplyIntensityToBuffer32_IsUnaligned(u32 *dst, size_t pixCount, float intensity) const; + size_t ApplyIntensityToBuffer32_SwapRB_IsUnaligned(u32 *dst, size_t pixCount, float intensity) const; +}; + +#endif // ENABLE_SSE2 + +#endif /* COLORSPACEHANDLER_SSE2_H */ diff --git a/FSCHMasterEditor/nds/retro_inline.h b/FSCHMasterEditor/nds/retro_inline.h new file mode 100644 index 0000000..bdc763c --- /dev/null +++ b/FSCHMasterEditor/nds/retro_inline.h @@ -0,0 +1,39 @@ +/* Copyright (C) 2010-2016 The RetroArch team + * + * --------------------------------------------------------------------------------------- + * The following license statement only applies to this file (retro_inline.h). + * --------------------------------------------------------------------------------------- + * + * Permission is hereby granted, free of charge, + * to any person obtaining a copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation the rights to + * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef __LIBRETRO_SDK_INLINE_H +#define __LIBRETRO_SDK_INLINE_H + +#ifndef INLINE + +#if defined(_WIN32) || defined(__INTEL_COMPILER) +#define INLINE __inline +#elif defined(__STDC_VERSION__) && __STDC_VERSION__>=199901L +#define INLINE inline +#elif defined(__GNUC__) +#define INLINE __inline__ +#else +#define INLINE +#endif + +#endif +#endif diff --git a/FSCHMasterEditor/nds/retro_miscellaneous.h b/FSCHMasterEditor/nds/retro_miscellaneous.h new file mode 100644 index 0000000..8f1db51 --- /dev/null +++ b/FSCHMasterEditor/nds/retro_miscellaneous.h @@ -0,0 +1,194 @@ +/* Copyright (C) 2010-2016 The RetroArch team + * + * --------------------------------------------------------------------------------------- + * The following license statement only applies to this file (retro_miscellaneous.h). + * --------------------------------------------------------------------------------------- + * + * Permission is hereby granted, free of charge, + * to any person obtaining a copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation the rights to + * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef __RARCH_MISCELLANEOUS_H +#define __RARCH_MISCELLANEOUS_H + +#include +#include + +#if defined(__CELLOS_LV2__) && !defined(__PSL1GHT__) +#include +#elif defined(XENON) +#include