Compare commits

..

3 Commits

Author SHA1 Message Date
gdkchan
b34de74f81 Avoid adding shader buffer descriptors for constant buffers that are not used (#3478)
* Avoid adding shader buffer descriptors for constant buffers that are not used

* Shader cache version
2022-07-23 11:15:58 -03:00
riperiperi
5811d121df Avoid scaling 2d textures that could be used as 3d (#3464) 2022-07-15 09:24:13 -03:00
Logan Stromberg
6eb85e846f Reduce some unnecessary allocations in DMA handler (#2886)
* experimental changes to try and reduce allocations in kernel threading and DMA handler

* Simplify the changes in this branch to just 1. Don't make unnecessary copies of data just for texture-texture transfers and 2. Add a fast path for 1bpp linear byte copies

* forgot to check src + dst linearity in 1bpp DMA fast path. Fixes the UE4 regression.

* removing dev log I left in

* Generalizing the DMA linear fast path to cases other than 1bpp copies

* revert kernel changes

* revert whitespace

* remove unneeded references

* PR feedback

Co-authored-by: Logan Stromberg <lostromb@microsoft.com>
Co-authored-by: gdk <gab.dark.100@gmail.com>
2022-07-14 15:45:56 -03:00
8 changed files with 55 additions and 29 deletions

View File

@@ -208,7 +208,6 @@ namespace Ryujinx.Graphics.Gpu.Engine.Dma
}
ReadOnlySpan<byte> srcSpan = memoryManager.GetSpan(srcGpuVa + (ulong)srcBaseOffset, srcSize, true);
Span<byte> dstSpan = memoryManager.GetSpan(dstGpuVa + (ulong)dstBaseOffset, dstSize).ToArray();
bool completeSource = IsTextureCopyComplete(src, srcLinear, srcBpp, srcStride, xCount, yCount);
bool completeDest = IsTextureCopyComplete(dst, dstLinear, dstBpp, dstStride, xCount, yCount);
@@ -262,43 +261,63 @@ namespace Ryujinx.Graphics.Gpu.Engine.Dma
target.SynchronizeMemory();
target.SetData(data);
target.SignalModified();
return;
}
else if (srcCalculator.LayoutMatches(dstCalculator))
{
srcSpan.CopyTo(dstSpan); // No layout conversion has to be performed, just copy the data entirely.
memoryManager.Write(dstGpuVa + (ulong)dstBaseOffset, dstSpan);
// No layout conversion has to be performed, just copy the data entirely.
memoryManager.Write(dstGpuVa + (ulong)dstBaseOffset, srcSpan);
return;
}
}
unsafe bool Convert<T>(Span<byte> dstSpan, ReadOnlySpan<byte> srcSpan) where T : unmanaged
{
fixed (byte* dstPtr = dstSpan, srcPtr = srcSpan)
if (srcLinear && dstLinear && srcBpp == dstBpp)
{
byte* dstBase = dstPtr - dstBaseOffset; // Layout offset is relative to the base, so we need to subtract the span's offset.
byte* srcBase = srcPtr - srcBaseOffset;
// Optimized path for purely linear copies - we don't need to calculate every single byte offset,
// and we can make use of Span.CopyTo which is very very fast (even compared to pointers)
for (int y = 0; y < yCount; y++)
{
srcCalculator.SetY(srcRegionY + y);
dstCalculator.SetY(dstRegionY + y);
int srcOffset = srcCalculator.GetOffset(srcRegionX);
int dstOffset = dstCalculator.GetOffset(dstRegionX);
srcSpan.Slice(srcOffset - srcBaseOffset, xCount * srcBpp)
.CopyTo(dstSpan.Slice(dstOffset - dstBaseOffset, xCount * dstBpp));
}
}
else
{
fixed (byte* dstPtr = dstSpan, srcPtr = srcSpan)
{
byte* dstBase = dstPtr - dstBaseOffset; // Layout offset is relative to the base, so we need to subtract the span's offset.
byte* srcBase = srcPtr - srcBaseOffset;
for (int x = 0; x < xCount; x++)
for (int y = 0; y < yCount; y++)
{
int srcOffset = srcCalculator.GetOffset(srcRegionX + x);
int dstOffset = dstCalculator.GetOffset(dstRegionX + x);
srcCalculator.SetY(srcRegionY + y);
dstCalculator.SetY(dstRegionY + y);
*(T*)(dstBase + dstOffset) = *(T*)(srcBase + srcOffset);
for (int x = 0; x < xCount; x++)
{
int srcOffset = srcCalculator.GetOffset(srcRegionX + x);
int dstOffset = dstCalculator.GetOffset(dstRegionX + x);
*(T*)(dstBase + dstOffset) = *(T*)(srcBase + srcOffset);
}
}
}
}
return true;
}
// OPT: This allocates a (potentially) huge temporary array and then copies an existing
// region of memory into it, data that might get overwritten entirely anyways. Ideally this should
// all be rewritten to use pooled arrays, but that gets complicated with packed data and strides
Span<byte> dstSpan = memoryManager.GetSpan(dstGpuVa + (ulong)dstBaseOffset, dstSize).ToArray();
bool _ = srcBpp switch
{
1 => Convert<byte>(dstSpan, srcSpan),

View File

@@ -174,6 +174,14 @@ namespace Ryujinx.Graphics.Gpu.Image
}
}
if (info.Width == info.Height * info.Height)
{
// Possibly used for a "3D texture" drawn onto a 2D surface.
// Some games do this to generate a tone mapping LUT without rendering into 3D texture slices.
return false;
}
return true;
}

View File

@@ -21,7 +21,7 @@ namespace Ryujinx.Graphics.Gpu.Shader.DiskCache
private const ushort FileFormatVersionMajor = 1;
private const ushort FileFormatVersionMinor = 1;
private const uint FileFormatVersionPacked = ((uint)FileFormatVersionMajor << 16) | FileFormatVersionMinor;
private const uint CodeGenVersion = 3457;
private const uint CodeGenVersion = 3478;
private const string SharedTocFileName = "shared.toc";
private const string SharedDataFileName = "shared.data";

View File

@@ -45,12 +45,12 @@ namespace Ryujinx.Graphics.Shader.Instructions
if (isFP64)
{
return context.PackDouble2x32(
context.Config.CreateCbuf(cbufSlot, cbufOffset),
context.Config.CreateCbuf(cbufSlot, cbufOffset + 1));
Cbuf(cbufSlot, cbufOffset),
Cbuf(cbufSlot, cbufOffset + 1));
}
else
{
return context.Config.CreateCbuf(cbufSlot, cbufOffset);
return Cbuf(cbufSlot, cbufOffset);
}
}

View File

@@ -300,6 +300,11 @@ namespace Ryujinx.Graphics.Shader.StructuredIr
if (operand.Type != OperandType.LocalVariable)
{
if (operand.Type == OperandType.ConstantBuffer)
{
Config.SetUsedConstantBuffer(operand.GetCbufSlot());
}
return new AstOperand(operand);
}

View File

@@ -68,7 +68,7 @@ namespace Ryujinx.Graphics.Shader.Translation.Optimizations
{
Operand addrLow = operation.GetSource(0);
Operand baseAddrLow = config.CreateCbuf(0, GetStorageCbOffset(config.Stage, storageIndex));
Operand baseAddrLow = Cbuf(0, GetStorageCbOffset(config.Stage, storageIndex));
Operand baseAddrTrunc = Local();
@@ -152,7 +152,7 @@ namespace Ryujinx.Graphics.Shader.Translation.Optimizations
{
Operand addrLow = operation.GetSource(0);
Operand baseAddrLow = config.CreateCbuf(0, UbeBaseOffset + storageIndex * StorageDescSize);
Operand baseAddrLow = Cbuf(0, UbeBaseOffset + storageIndex * StorageDescSize);
Operand baseAddrTrunc = Local();

View File

@@ -75,9 +75,9 @@ namespace Ryujinx.Graphics.Shader.Translation
int cbOffset = GetStorageCbOffset(config.Stage, slot);
Operand baseAddrLow = config.CreateCbuf(0, cbOffset);
Operand baseAddrHigh = config.CreateCbuf(0, cbOffset + 1);
Operand size = config.CreateCbuf(0, cbOffset + 2);
Operand baseAddrLow = Cbuf(0, cbOffset);
Operand baseAddrHigh = Cbuf(0, cbOffset + 1);
Operand size = Cbuf(0, cbOffset + 2);
Operand offset = PrependOperation(Instruction.Subtract, addrLow, baseAddrLow);
Operand borrow = PrependOperation(Instruction.CompareLessU32, addrLow, baseAddrLow);

View File

@@ -360,12 +360,6 @@ namespace Ryujinx.Graphics.Shader.Translation
UsedFeatures |= flags;
}
public Operand CreateCbuf(int slot, int offset)
{
SetUsedConstantBuffer(slot);
return OperandHelper.Cbuf(slot, offset);
}
public void SetUsedConstantBuffer(int slot)
{
_usedConstantBuffers |= 1 << slot;