propaganda

Branchless Line Drawing

When it comes to drawing non-antialised lines, undoubtedly the very first approach that comes to mind is of course Bresenham’s line drawing algorithm.

I always dreaded implementing it, simply because it’s full of special cases; and special cases generally mean branches, which are the last things you’d want anywhere near tight loops.

A couple of weeks ago, I had a shower thought. Why not just use linear interpolation and simply interpolate between the two points?

Naturally, I was almost certain that someone, somewhere, already thought of this and perhaps even patented it as folks do with these things.

After a cursory search on the information super highway I confirmed that my gut instinct was right, however the implementations I found still used branches and superfluous divisions, so I decided to roll and share my own flavor.

void image_render_line(
	image_t *image,
	const point_t p0,
	const point_t p1,
	const color_t color
)
{
	color_t *pixels = image->pixels;
	const int w = image->w;

	const float x0 = p0.x;
	const float y0 = p0.y;
	const float dx = p1.x - x0;
	const float dy = p1.y - y0;

	const float step = 1.0f / fmaxf(fabsf(dx), fabsf(dy));

	for(float t = 0.0f; t < 1.0f; t += step)
	{
		const int x = x0 + dx * t;
		const int y = y0 + dy * t;

		pixels[x + y * w] = color;
	}
}

Let’s also take a peek at the disassembly, just for funsies no less.

 ; cc -O2 -ffast-math -masm=intel -S line.c
 image_render_line:
 	movq	r8, xmm0
 	movq	rax, xmm1
 	movdqa	xmm4, xmm1
 	mov	ecx, edi
 	shr	r8, 32
 	shr	rax, 32
 	subss	xmm4, xmm0
 	movdqa	xmm7, xmm0
 	movd	xmm3, r8d
 	movd	xmm1, eax
 	mov	edi, edx
 	movss	xmm5, DWORD PTR .LC2[rip]
 	subss	xmm1, xmm3
 	movss	xmm0, DWORD PTR .LC1[rip]
 	movaps	xmm6, xmm5
 	movaps	xmm2, xmm1
 	andps	xmm2, xmm0
 	andps	xmm0, xmm4
 	maxss	xmm0, xmm2
 	divss	xmm6, xmm0
 	pxor	xmm0, xmm0
 .L2:
 	movaps	xmm2, xmm1
 	mulss	xmm2, xmm0
 	addss	xmm2, xmm3
 	cvttss2si	eax, xmm2
 	movaps	xmm2, xmm4
 	mulss	xmm2, xmm0
 	addss	xmm0, xmm6
 	imul	eax, ecx
 	addss	xmm2, xmm7
 	cvttss2si	edx, xmm2
 	add	eax, edx
 	comiss	xmm5, xmm0
 	cdqe
 	mov	DWORD PTR [rsi+rax*4], edi
 	ja	.L2
 	ret

What’s the catch? There must be some caveats, right?

First of all, there is no clipping, so you must ensure that both points are within the viewport. In other words, the following must be true:

p0.x >= 0 && p0.x <= w - 1
p0.y >= 0 && p0.y <= h - 1
p1.x >= 0 && p1.x <= w - 1
p1.y >= 0 && p1.y <= h - 1

Second, it doesn’t ensure that t = 1.0f which means that line is not guaranteed reach the coordinates of the endpoint.

This of course could be addressed by drawing one additional point or simply changing t < 1.0f to t <= 1.0f and then making sure that t never goes beyond 1.0f.

Third, it doesn’t handle the case when both points are in the same position, in other words p0 == p1.

This could be solved as well relatively easily, by clamping the step value before the division, like illustrated below.

const float step = 1.0f / fmax(fmax(fabs(dx), fabs(dy)), 1.0f);

That’s it. And now, here’s an actual full fledged example, which outputs a lovely wireframe render of Suzanne.

/*
	MIT LICENSE
	Copyright (c) 2023, Mihail Szabolcs
*/
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <stdint.h>
#include <math.h>

#ifndef IMAGE_WIDTH
	#define IMAGE_WIDTH 1024
#endif

#ifndef IMAGE_HEIGHT
	#define IMAGE_HEIGHT 1024
#endif

#define IMAGE_SIZE IMAGE_WIDTH * IMAGE_HEIGHT

typedef union
{
	uint32_t bgra;
	struct
	{
		uint8_t b : 8;
		uint8_t g : 8;
		uint8_t r : 8;
		uint8_t a : 8;
	};
} color_t;

#pragma pack(push, 1)
typedef struct {
   uint8_t idlength;
   uint8_t colourmaptype;
   uint8_t datatypecode;
   uint16_t colourmaporigin;
   uint16_t colourmaplength;
   uint8_t colourmapdepth;
   uint16_t x_origin;
   uint16_t y_origin;
   uint16_t width;
   uint16_t height;
   uint8_t bitsperpixel;
   uint8_t imagedescriptor;
} image_header_t;
#pragma pack(pop)

typedef struct
{
	int w;
	int h;
	int size;
	color_t *pixels;
	image_header_t header;
} image_t;

typedef struct
{
	float x, y;
} point_t;

typedef struct
{
	float x, y, z;
} vertex_t;

typedef struct
{
	int v0, v1, v2;
} triangle_t;

typedef struct
{
	vertex_t *vertices;
	triangle_t *triangles;
	int num_triangles;
} mesh_t;

static inline point_t vertex_to_point(
    const vertex_t vertex,
    const float half_w,
    const float half_h
);

static void image_clear(image_t *image, const color_t color);
static bool image_save(image_t *image, const char *filename);
static void image_render_line(
    image_t *image,
    const point_t p0,
    const point_t p1,
    const color_t color
);
static void image_render_mesh(
    image_t *image,
    const mesh_t *mesh,
    const color_t color
);

static image_t *image = &(image_t) {
	.w = IMAGE_WIDTH,
	.h = IMAGE_HEIGHT,
	.size = IMAGE_SIZE,
	.pixels = (color_t[IMAGE_SIZE]) { 0, },
	.header = {
		.datatypecode = 2,
		.width = IMAGE_WIDTH,
		.height = IMAGE_HEIGHT,
		.y_origin = IMAGE_HEIGHT,
		.bitsperpixel = sizeof(color_t) << 3,
		.imagedescriptor = 8 | (1 << 5)
	}
};

static const mesh_t *mesh = &(mesh_t) {
	#include "mesh.h"
};

int main(int argc, char *argv[])
{
	if(argc != 2)
	{
		fprintf(stderr, "usage: %s output.tga\n", argv[0]);
		return EXIT_FAILURE;
	}

	image_clear(image, (color_t) { .a = 0xFF });

	image_render_mesh(
        image,
        mesh,
        (color_t) {
            .r = 0xFF,
            .g = 0xFF,
            .a = 0xFF
        }
    );

	if(!image_save(image, argv[1]))
	{
		fprintf(stderr, "error: could not save image `%s'\n", argv[1]);
		return EXIT_FAILURE;
	}

	return EXIT_SUCCESS;
}

static inline point_t vertex_to_point(
    const vertex_t vertex,
    const float half_w,
    const float half_h
)
{
	return (point_t) {
		.x = vertex.x * half_w + half_w,
		.y = vertex.y * half_h + half_h
	};
}

static void image_clear(image_t *image, const color_t color)
{
	for(color_t *pixel = image->pixels,
        *end = image->pixels + image->size; pixel != end; pixel++)
		*pixel = color;
}

static bool image_save(image_t *image, const char *filename)
{
	FILE *fp = fopen(filename, "wb");
	if(fp == NULL)
		return false;

	if(fwrite(
        &image->header,
        1,
        sizeof(image->header), fp) != sizeof(image->header))
	{
		fclose(fp);
		return false;
	}

	if(fwrite(
        image->pixels,
        sizeof(color_t),
        image->size, fp) != image->size)
	{
		fclose(fp);
		return false;
	}

	fclose(fp);
	return true;
}

static void image_render_line(
	image_t *image,
	const point_t p0,
	const point_t p1,
	const color_t color
)
{
	color_t *pixels = image->pixels;
	const int w = image->w;

	const float x0 = p0.x;
	const float y0 = p0.y;
	const float dx = p1.x - x0;
	const float dy = p1.y - y0;

	const float step = 1.0f / fmaxf(fabsf(dx), fabsf(dy));

	for(float t = 0.0f; t < 1.0f; t += step)
	{
		const int x = x0 + dx * t;
		const int y = y0 + dy * t;

		pixels[x + y * w] = color;
	}
}

static void image_render_mesh(
    image_t *image,
    const mesh_t *mesh,
    const color_t color
)
{
	const vertex_t *vertices = mesh->vertices;
	const float half_w = image->w * 0.5f;
	const float half_h = image->h * 0.5f;

	for(const triangle_t *tri = mesh->triangles,
        *end = mesh->triangles + mesh->num_triangles; tri != end; tri++)
	{
		const vertex_t v0 = vertices[tri->v0];
		const vertex_t v1 = vertices[tri->v1];
		const vertex_t v2 = vertices[tri->v2];

		const point_t p0 = vertex_to_point(v0, half_w, half_h);
		const point_t p1 = vertex_to_point(v1, half_w, half_h);
		const point_t p2 = vertex_to_point(v2, half_w, half_h);

		image_render_line(image, p0, p1, color);
		image_render_line(image, p1, p2, color);
		image_render_line(image, p2, p0, color);
	}
}

/* vim: set ts=4 sw=4 sts=4 noet: */

suzanne

What a thing of beauty, isn’t it? Those lovely jaggies, just be hitting different. I think so too.

If we really wanted to do away with the fmax, fabs and the division, we could borrow the fast inverse square root implementation from Q3 and optimize things even further.

float Q_rsqrt(float number)
{
 	long i;
 	float x2, y;
 	const float threehalfs = 1.5F;

 	x2 = number * 0.5F;
 	y  = number;
 	i  = * ( long * ) &y; // evil floating point bit level hacking
 	i  = 0x5f3759df - ( i >> 1 ); // what the fuck?
 	y  = * ( float * ) &i;
 	y  = y * ( threehalfs - ( x2 * y * y ) ); // 1st iteration

 	return y;
}

void image_render_line(
	image_t *image,
	const point_t p0,
	const point_t p1,
	const color_t color
)
{
	color_t *pixels = image->pixels;
	const int w = image->w;

	const float x0 = p0.x;
	const float y0 = p0.y;
	const float dx = p1.x - x0;
	const float dy = p1.y - y0;

	const float step = Q_rsqrt(dx * dx + dy * dy);

	for(float t = 0.0f; t < 1.0f; t += step)
	{
		const int x = x0 + dx * t;
		const int y = y0 + dy * t;

		pixels[x + y * w] = color;
	}
}

2023-06-25