I recently posted ("Subject: [PATCH] Optimise memset on i386" - sorry, I don't seem to have a route to lists.gnu.org at the moment so I can't post an archive link) about optimising GRUB's video initialisation, and hinted that it might be possible to do better by implementing MTRRs as well in order to allow the system to combine writes to video memory rather than taking a cache stall for every single write. I can report that, at least on the hardware I was using, it does make a significant difference: filling the screen with solid colour now takes 10 milliseconds rather than 160! This ended up shaving about a second off the boot time of the project I'm working on.
On Linux, you can tell whether this is likely to be the case by comparing the kernel log from startup with /proc/mtrr. For example, on my Dell Latitude D830 with BIOS version A02, the kernel log says: MTRR variable ranges enabled: 0 base 000000000 mask F80000000 write-back 1 base 080000000 mask FC0000000 write-back 2 base 0BF800000 mask FFF800000 uncachable 3 base 0BF700000 mask FFFF00000 uncachable 4 disabled 5 disabled 6 disabled 7 disabled ... while /proc/mtrr says: reg00: base=0x000000000 ( 0MB), size= 2048MB, count=1: write-back reg01: base=0x080000000 ( 2048MB), size= 1024MB, count=1: write-back reg02: base=0x0bf800000 ( 3064MB), size= 8MB, count=1: uncachable reg03: base=0x0bf700000 ( 3063MB), size= 1MB, count=1: uncachable reg04: base=0x0e0000000 ( 3584MB), size= 256MB, count=1: write-combining The extra write-combining entry there matches the video memory aperture (you can check this by comparing with /proc/iomem and 'lspci -vvnn'), and I think it's set up by the X driver. The following patch is against 1.98, since that's what I was developing on. I'm not posting this as a serious merge proposal right now so I haven't included a ChangeLog or anything; this is just an RFC, and might be useful for others interested in this kind of thing (Seth Goldberg commented on IRC that he had been looking into something similar). Flaws in this approach include: * Doesn't work with anything other than the generic Intel MTRR system (although modern AMD chips have this too) * Very simplistic MTRR handling: anything other than a card with a power-of-two memory region aligned on a same-power-of-two boundary will degrade to previous behaviour * Might break older cards where write-combining isn't permitted (I found an Intel paper saying such cards existed); in particular it's possible that some cards might put registers in that region * Entirely unconfigurable It's also probably in the wrong place in the tree; I imagine EFI drivers would want to do much the same thing, for example - and my patch has far too many magic constants. I did at least take care to disable any MTRR added by GRUB before starting the target kernel; who knows what effects that might have, especially on multiprocessor systems (although Linux does work around out-of-sync MTRRs across CPUs). Still, I'd like to know whether this is of general enough interest to merit polishing it up and maybe offering it as some kind of configurable option. I do think that this is a BIOS bug, but it seems to be a not uncommon one and it does have a pretty noticeable effect on GRUB's video performance. diff -Nur -x '*.orig' -x '*~' grub2-1.98/video/i386/pc/vbe.c grub2-1.98.new/video/i386/pc/vbe.c --- grub2-1.98/video/i386/pc/vbe.c 2010-06-24 21:52:06.000000000 +0000 +++ grub2-1.98.new/video/i386/pc/vbe.c 2010-06-24 22:12:05.000000000 +0000 @@ -28,6 +28,7 @@ #include <grub/misc.h> #include <grub/mm.h> #include <grub/video.h> +#include <grub/cpu/tsc.h> static int vbe_detected = -1; @@ -48,6 +49,7 @@ grub_uint32_t active_vbe_mode; grub_uint8_t *ptr; int index_color_mode; + int mtrr; char *offscreen_buffer; @@ -124,6 +126,157 @@ return GRUB_ERR_NONE; } +#define cpuid(num,a,b,c,d) \ + asm volatile ("xchgl %%ebx, %1; cpuid; xchgl %%ebx, %1" \ + : "=a" (a), "=r" (b), "=c" (c), "=d" (d) \ + : "0" (num)) + +#define rdmsr(num,a,d) \ + asm volatile ("rdmsr" : "=a" (a), "=d" (d) : "c" (num)) + +#define wrmsr(num,lo,hi) \ + asm volatile ("wrmsr" : : "c" (num), "a" (lo), "d" (hi) : "memory") + +#define mtrr_base(reg) (0x200 + (reg) * 2) +#define mtrr_mask(reg) (0x200 + (reg) * 2 + 1) + +/* Try to set up a variable-range write-combining MTRR for a memory region. + This is best-effort; if it seems too hard, we just accept the performance + degradation rather than risking undefined behaviour. It is intended + exclusively to work around BIOS bugs, as the BIOS should already be + setting up a suitable MTRR. */ +static void +grub_vbe_enable_mtrr (grub_uint8_t *base, grub_size_t size) +{ + grub_uint32_t eax, ebx, ecx, edx; + grub_uint32_t features; + grub_uint32_t mtrrcap; + int var_mtrrs; + grub_uint32_t max_extended_cpuid; + grub_uint32_t maxphyaddr; + grub_uint64_t fb_base, fb_size; + grub_uint64_t size_bits, fb_mask; + grub_uint32_t bits_lo, bits_hi; + grub_uint64_t bits; + int i, first_unused = -1; + grub_uint32_t base_lo, base_hi, mask_lo, mask_hi; + + fb_base = (grub_uint64_t) (grub_size_t) base; + fb_size = (grub_uint64_t) size; + + /* Check that fb_base and fb_size can be represented using a single + MTRR. */ + + if (fb_base < (1 << 20)) + return; /* under 1MB, so covered by fixed-range MTRRs */ + if (fb_base >= (1LL << 36)) + return; /* over 36 bits, so out of range */ + if (fb_size < (1 << 12)) + return; /* variable-range MTRRs must cover at least 4KB */ + + size_bits = fb_size; + while (size_bits > 1) + size_bits >>= 1; + if (size_bits != 1) + return; /* not a power of two */ + + if (fb_base & (fb_size - 1)) + return; /* not aligned on size boundary */ + + fb_mask = ~(fb_size - 1); + + /* Check CPU capabilities. */ + + if (! grub_cpu_is_cpuid_supported ()) + return; + + cpuid (1, eax, ebx, ecx, edx); + features = edx; + if (! (features & 0x00001000)) /* MTRR */ + return; + + rdmsr (0xFE, eax, edx); + mtrrcap = eax; + if (! (mtrrcap & 0x00000400)) /* write-combining */ + return; + var_mtrrs = (mtrrcap & 0xFF); + + cpuid (0x80000000, eax, ebx, ecx, edx); + max_extended_cpuid = eax; + if (max_extended_cpuid >= 0x80000008) + { + cpuid (0x80000008, eax, ebx, ecx, edx); + maxphyaddr = (eax & 0xFF); + } + else + maxphyaddr = 36; + bits_lo = 0xFFFFF000; /* assume maxphyaddr >= 36 */ + bits_hi = (1 << (maxphyaddr - 32)) - 1; + bits = bits_lo | ((grub_uint64_t) bits_hi << 32); + + /* Check whether an MTRR already covers this region. If not, take an + unused one if possible. */ + for (i = 0; i < var_mtrrs; i++) + { + rdmsr (mtrr_mask (i), eax, edx); + mask_lo = eax; + mask_hi = edx; + if (mask_lo & 0x800) /* valid */ + { + grub_uint64_t real_base, real_mask; + + rdmsr (mtrr_base (i), eax, edx); + base_lo = eax; + base_hi = edx; + + real_base = ((grub_uint64_t) (base_hi & bits_hi) << 32) | + (base_lo & bits_lo); + real_mask = ((grub_uint64_t) (mask_hi & bits_hi) << 32) | + (mask_lo & bits_lo); + if (real_base < (fb_base + fb_size) && + real_base + (~real_mask & bits) >= fb_base) + return; /* existing MTRR overlaps this region */ + } + else if (first_unused < 0) + first_unused = i; + } + + if (first_unused < 0) + return; /* all MTRRs in use */ + + /* Set up the first unused MTRR we found. */ + rdmsr (mtrr_base (first_unused), eax, edx); + base_lo = eax; + base_hi = edx; + rdmsr (mtrr_mask (first_unused), eax, edx); + mask_lo = eax; + mask_hi = edx; + + base_lo = (base_lo & ~bits_lo & ~0xFF) | + (fb_base & bits_lo) | 0x01 /* WC */; + base_hi = (base_hi & ~bits_hi) | ((fb_base >> 32) & bits_hi); + wrmsr (mtrr_base (first_unused), base_lo, base_hi); + mask_lo = (mask_lo & ~bits_lo) | (fb_mask & bits_lo) | 0x800 /* valid */; + mask_hi = (mask_hi & ~bits_hi) | ((fb_mask >> 32) & bits_hi); + wrmsr (mtrr_mask (first_unused), mask_lo, mask_hi); + + framebuffer.mtrr = first_unused; +} + +static void +grub_vbe_disable_mtrr (int mtrr) +{ + grub_uint32_t eax, edx; + grub_uint32_t mask_lo, mask_hi; + + rdmsr (mtrr_mask (mtrr), eax, edx); + mask_lo = eax; + mask_hi = edx; + + mask_lo &= ~0x800 /* valid */; + wrmsr (mtrr_mask (mtrr), mask_lo, mask_hi); +} + grub_err_t grub_vbe_set_video_mode (grub_uint32_t vbe_mode, struct grub_vbe_mode_info_block *vbe_mode_info) @@ -355,6 +508,7 @@ /* Reset frame buffer. */ grub_memset (&framebuffer, 0, sizeof(framebuffer)); + framebuffer.mtrr = -1; return grub_video_fb_init (); } @@ -378,6 +532,11 @@ err = grub_video_fb_fini (); grub_free (framebuffer.offscreen_buffer); + if (framebuffer.mtrr >= 0) + { + grub_vbe_disable_mtrr (framebuffer.mtrr); + framebuffer.mtrr = -1; + } return err; } @@ -684,6 +843,9 @@ framebuffer.mode_info.blit_format = grub_video_get_blit_format (&framebuffer.mode_info); + grub_vbe_enable_mtrr (framebuffer.ptr, + controller_info.total_memory << 16); + /* Set up double buffering and targets. */ err = double_buffering_init (mode_type, mode_mask); if (err) @@ -774,6 +936,11 @@ grub_video_fb_fini (); grub_free (framebuffer.offscreen_buffer); + if (framebuffer.mtrr >= 0) + { + grub_vbe_disable_mtrr (framebuffer.mtrr); + framebuffer.mtrr = -1; + } return GRUB_ERR_NONE; } Thanks, -- Colin Watson [cjwat...@ubuntu.com] _______________________________________________ Grub-devel mailing list Grub-devel@gnu.org http://lists.gnu.org/mailman/listinfo/grub-devel