; MMX QSound module
;   by Daniel Moreno (ComaC) - 2001  < comac2k@teleline.es >
;
; ChannelMix_8S / ChannelMix_8U mix one channel using 4-point interpolation.
; The volume of the resulting audio is 256 times louder, to keep extra
; precision while mixing all the channels.
;
; AdaptSoundBuff / AdaptSoundBuff_Add adapt the buffer to the classical
; 16-bit stereo format, correcting the volume.


[BITS 32]

section .text

global ChannelMix_8S
global _ChannelMix_8S

global ChannelMix_8U
global _ChannelMix_8U

global AdaptSoundBuff
global _AdaptSoundBuff

global AdaptSoundBuff_Add
global _AdaptSoundBuff_Add

extern _PrecalcData
PrecalcMMX equ _PrecalcData

; MIX_Bukle_8 -- main mixing loop for 8 bit samples. It is expanded inline by
; the ChannelMix_* entry points below.
;
; Register inputs:
;
; EAX -> pointer to sample buffer (each load is done from [EAX + Integer(pos)])
; EBX -> index inside sample (current position), fixed point:
;        fraction in bits 0..11, integer part in bits 12..27
; ECX -> index of the first sample that does NOT have to play (LoopEnd),
;        same fixed point format as EBX
; ESI -> pointer to destination buffer (8 bytes are accumulated per sample)
; mm3 -> Volumes Low = Left, High = Right
;
; Macro parameters:
;
; Param 1 -> position increment, added to EBX once per output sample
; Param 2 -> dword operand containing loop length (same format as EBX)
; Param 3 -> dword operand containing size of out buffer in samples;
;            it is decremented in place once per mixed sample
; Param 4 -> MMX operand to xor with the unpacked samples (sign adaptation),
;            or text NONE if none
;
; At exit:
;
; EAX -> 1 if the channel stays on, 0 if the sample ended and has no loop
; EBX -> next position to play (0 when the sample was stopped)
; EDX -> destroyed
; ESI -> advanced by 8 bytes per mixed sample
; [Param 3] -> decremented once per mixed sample (zero if the buffer was filled)
; mm0 -> destroyed
; mm1 -> destroyed
; EMMS has been executed.
;
; An out buffer size that is zero (or negative when read as a signed dword)
; mixes nothing. Without that guard the first "dec" would wrap the counter
; and the loop would write far beyond the end of the destination buffer.

%macro MIX_Bukle_8 4

%define INCR %1
%define LOOP %2
%define BUFLEN %3

	cmp	EBX, ECX		; Already at/past the end?
	jl	%%CheckLen		; No -> make sure there is something to mix

%%LoopCheck:
	cmp	LOOP, 0x1000		; Loop longer than one sample (1.0 in fixed point)?
	jg	%%LoopSample		; Yes -> Loop Sample

	cmp	LOOP, 0
	je	%%StopSample		; No loop at all -> stop the channel

	sub	EBX, LOOP		; reset looppoint
	and	EBX, 0xFFFFF000		; drop the fractional part
	jmp	%%End_StayOn		; exit, channel stays on

%%LoopSample:
	sub	EBX, LOOP		; Jump back by the loop length
	and	EBX, 0xFFFFF000		; drop the fractional part

	; Both entry paths come through here. When reached from inside the
	; loop the counter is known to be non-zero, so this only ever rejects
	; an empty (or negative) request made by the caller.
%%CheckLen:
	cmp	BUFLEN, 0		; Anything to mix? (signed compare)
	jg	%%Sigue			; Yes -> mix
	jmp	%%End_StayOn		; No -> leave the buffer untouched

%%StopSample:
	xor	EBX, EBX		; position = 0
	xor	EAX, EAX		; set bKey to zero
	jmp	%%End			; exit function

%%Bukle:
	cmp	EBX, ECX		; Past the end?
	jge	%%LoopCheck		; Yes -> handle loop / end of sample

%%Sigue:
	mov	EDX, EBX		; Keep the full position
	and	EBX, (0xFFFF << 12)	; QSound banks are 0x10000 bytes
	shr	EBX, 12			; EBX = Integer(nPos)

	pxor	mm0, mm0		; Zero mm0 (samples land in the high byte of each word)
	punpcklbw mm0, [EAX+EBX]	; Load & unpack samples to 16 bit

	mov	EBX, EDX		; Restore the full position
	and	EDX, 0x0FFF		; EDX = Decimal(nPos)

	; Execute sign adaptation instructions if necessary:

%ifnidni %4, NONE
	pxor	mm0, %4
%endif

	pmaddwd	mm0, [PrecalcMMX + EDX*8] ; Interpolate (4 coefficients per fraction)
	movq	mm1, mm0		; We need High(mm0) + Low(mm0)
	psrlq	mm0, 32			; mm0 = High(mm0)
	paddd	mm0, mm1		; mm0 = Sample interpolated * 16384
	psrad	mm0, 14			; mm0 = Sample interpolated
	packssdw mm0, mm0		; Hi(mm0) = sample, Low(mm0) = sample
	add	EBX, INCR		; Advance counters...
	pmaddwd	mm0, mm3		; Hi(mm0) = SampleR*256, Low(mm0) = SampleL*256
	dec	BUFLEN			; 1 sample less left (sets ZF for the jnz below)

	; None of the next three instructions modifies EFLAGS, so the jnz
	; still tests the result of the "dec" above.
	paddd	mm0, [ESI]		; Add to buffer
	movq	[ESI], mm0		; Store result on buffer

	lea	ESI, [ESI + 8]		; Next output sample (lea keeps the flags)
	jnz	%%Bukle			; Continue if there are more samples

%%End_StayOn:
	mov	EAX, 1			; leave bKey at 1
%%End:
	emms

%endmacro



; Parameters to ChannelMix_8S/8U:
;
; All of them are stack arguments addressed through EBP once the entry code
; has done "push EBP / mov EBP, ESP", so the first argument is at [EBP+8].
;
; BuffDest   -> destination buffer (loaded into ESI)
; BuffLen    -> size of the out buffer in samples; the mixing macro
;               decrements this stack slot in place
; SampleBuff -> sample buffer (loaded into EAX)
; LoopEnd    -> first position that does not have to play (loaded into ECX)
; PosPtr     -> pointer to the dword holding the current position; it is
;               read at entry and written back at exit
; Volumes    -> 8 bytes loaded into mm3 (takes two stack slots, which is why
;               the next parameter is at +36)
; LoopLen    -> loop length
; IncrPos    -> position increment per output sample (loaded into EDI)

%define BuffDest	dword [EBP+8]
%define BuffLen		dword [EBP+12]
%define SampleBuff	dword [EBP+16]
%define LoopEnd		dword [EBP+20]
%define PosPtr		dword [EBP+24]
%define Volumes		qword [EBP+28]
%define LoopLen		dword [EBP+36]
%define IncrPos		dword [EBP+40]

_ChannelMix_8S:
ChannelMix_8S:
	; Mixes one channel of signed 8 bit samples into the destination
	; buffer. Arguments are taken from the stack (see the defines above).
	;
	; Out:   EAX = 1 if the channel stays on, 0 if the sample was stopped
	;        [PosPtr] = next position to play
	; Saves: EBP, EBX, ESI, EDI
	; Uses:  ECX, EDX, mm0, mm1, mm3 (EMMS is executed by the mixing macro)

	push	EBP
	mov	EBP, ESP

	push	EBX			; Registers preserved for the caller
	push	ESI
	push	EDI

	; Set up the register interface expected by MIX_Bukle_8

	movq	mm3, Volumes		; mm3 = Volumes
	mov	ESI, BuffDest		; ESI = destination buffer
	mov	EDI, IncrPos		; EDI = position increment
	mov	ECX, LoopEnd		; ECX = LoopEnd

	mov	EAX, SampleBuff		; EAX = Sample Buffer...
	dec	EAX			; ...minus one: loads start 1 byte before the position

	mov	EBX, PosPtr
	mov	EBX, [EBX]		; EBX = Sample Position

	MIX_Bukle_8 EDI, dword LoopLen, dword BuffLen, NONE

	mov	ECX, PosPtr
	mov	[ECX], EBX		; Save position

	pop	EDI
	pop	ESI
	pop	EBX

	pop	EBP

	ret


_ChannelMix_8U:
ChannelMix_8U:
	; Mixes one channel of unsigned 8 bit samples into the destination
	; buffer. Same as ChannelMix_8S, except that every unpacked sample is
	; xored with XorSign (0x8000 per word) to change its sign.
	;
	; Out:   EAX = 1 if the channel stays on, 0 if the sample was stopped
	;        [PosPtr] = next position to play
	; Saves: EBP, EBX, ESI, EDI
	; Uses:  ECX, EDX, mm0, mm1, mm3, mm4 (EMMS is executed by the mixing macro)

	push	EBP
	mov	EBP, ESP

	push	EBX			; Registers preserved for the caller
	push	ESI
	push	EDI

	; Set up the register interface expected by MIX_Bukle_8

	movq	mm4, [XorSign]		; mm4 = Value to xor to adapt sign
	movq	mm3, Volumes		; mm3 = Volumes
	mov	ESI, BuffDest		; ESI = destination buffer
	mov	EDI, IncrPos		; EDI = position increment
	mov	ECX, LoopEnd		; ECX = LoopEnd

	mov	EAX, SampleBuff		; EAX = Sample Buffer...
	dec	EAX			; ...minus one: loads start 1 byte before the position

	mov	EBX, PosPtr
	mov	EBX, [EBX]		; EBX = Sample Position

	MIX_Bukle_8 EDI, dword LoopLen, dword BuffLen, mm4

	mov	ECX, PosPtr
	mov	[ECX], EBX		; Save position

	pop	EDI
	pop	ESI
	pop	EBX

	pop	EBP

	ret



; Parameters to AdaptSoundBuff and AdaptSoundBuff_Add
;
; Stack arguments addressed through EBP after "push EBP / mov EBP, ESP":
;
; BufSrc  -> source buffer, read 8 bytes (two dwords) per sample
; BufDest -> destination buffer, written 4 bytes (two words) per sample
; Len     -> number of samples to convert; the loops handle two per
;            iteration and a final odd one separately

%define BufSrc [EBP+8]
%define BufDest [EBP+12]
%define Len [EBP+16]

_AdaptSoundBuff:
AdaptSoundBuff:
	; Converts Len samples from the mixing buffer (two dwords per sample,
	; each holding a value * 256) into packed 16 bit words, overwriting
	; the destination buffer.
	;
	; Every dword is shifted right arithmetically by 8 and then packed
	; with signed saturation.
	;
	; Uses: EAX, ECX, EDX, mm0, mm1. Executes EMMS before returning.

	push	EBP
	mov	EBP, ESP

	mov	EDX, BufDest		; EDX = Buff Dest
	mov	ECX, BufSrc		; ECX = Buff Src

	mov	EAX, Len
	shr	EAX, 1			; EAX = number of whole pairs of samples
	jz	.tail			; Fewer than 2 samples -> only the tail is left

.pairs:
	movq	mm0, [ECX]		; Hi(mm0) = Sample1R*256, Low(mm0) = Sample1L*256
	movq	mm1, [ECX + 8]		; Hi(mm1) = Sample2R*256, Low(mm1) = Sample2L*256
	add	ECX, byte 16		; ECX -> next pair in the source

	psrad	mm0, 8			; Undo the *256 scaling
	psrad	mm1, 8
	packssdw mm0, mm1		; mm0 = L1, R1, L2, R2 as saturated words

	movq	[EDX], mm0		; Save the result
	add	EDX, byte 8		; EDX -> next pair in the destination

	dec	EAX			; One pair less left
	jnz	.pairs

.tail:
	mov	EAX, Len
	test	EAX, 1			; Odd amount of samples?
	jz	.done

	; Convert the single remaining sample, writing only 4 bytes
	movq	mm0, [ECX]
	psrad	mm0, 8
	packssdw mm0, mm0
	movd	[EDX], mm0

.done:
	emms				; Done with MMX (AdaptBuf is the last step)

	pop	EBP

	ret


_AdaptSoundBuff_Add:
AdaptSoundBuff_Add:
	; Same conversion as AdaptSoundBuff, but the converted 16 bit words are
	; added (with signed saturation) to what the destination buffer
	; already contains instead of replacing it.
	;
	; Uses: EAX, ECX, EDX, mm0, mm1. Executes EMMS before returning.

	push	EBP
	mov	EBP, ESP

	mov	EDX, BufDest		; EDX = Buff Dest
	mov	ECX, BufSrc		; ECX = Buff Src

	mov	EAX, Len
	shr	EAX, 1			; EAX = number of whole pairs of samples
	jz	.tail			; Fewer than 2 samples -> only the tail is left

.pairs:
	movq	mm0, [ECX]		; Hi(mm0) = Sample1R*256, Low(mm0) = Sample1L*256
	movq	mm1, [ECX + 8]		; Hi(mm1) = Sample2R*256, Low(mm1) = Sample2L*256
	add	ECX, byte 16		; ECX -> next pair in the source

	psrad	mm0, 8			; Undo the *256 scaling
	psrad	mm1, 8
	packssdw mm0, mm1		; mm0 = L1, R1, L2, R2 as saturated words

	paddsw	mm0, [EDX]		; Add to the contents of the buffer
	movq	[EDX], mm0		; Save the result
	add	EDX, byte 8		; EDX -> next pair in the destination

	dec	EAX			; One pair less left
	jnz	.pairs

.tail:
	mov	EAX, Len
	test	EAX, 1			; Odd amount of samples?
	jz	.done

	; Mix the single remaining sample. The destination is accessed with
	; 32 bit moves so nothing past the end of the buffer is read or written.
	movq	mm0, [ECX]
	psrad	mm0, 8
	packssdw mm0, mm0
	movd	mm1, [EDX]
	paddsw	mm0, mm1
	movd	[EDX], mm0

.done:
	emms				; Done with MMX (AdaptBuf is the last step)

	pop	EBP

	ret



section	.data

; Mask xored with the unpacked samples to change their sign: it flips the
; top bit of each of the four 16 bit words (loaded into mm4 by ChannelMix_8U).
XorSign:	times 4 dw 0x8000

section	.bss

; NOTE(review): neither of these two symbols is referenced by the code in
; this file and they are not declared global -- confirm whether they are
; still needed.
VolMMX:		resd	2
ActiveFlag:	resb	1

