arm/155214: [patch] MMC/SD IO slow on Atmel ARM with modern large SD cards (updated patch)

Ian Lepore freebsd at damnhippie.dyndns.org
Tue Apr 19 18:10:11 UTC 2011


The following reply was made to PR arm/155214; it has been noted by GNATS.

From: Ian Lepore <freebsd at damnhippie.dyndns.org>
To: bug-followup at FreeBSD.org, freebsd at damnhippie.dyndns.org
Cc:  
Subject: Re: arm/155214: [patch] MMC/SD IO slow on Atmel ARM with modern
 large SD cards (updated patch)
Date: Tue, 19 Apr 2011 12:06:43 -0600

 Index: sys/arm/at91/at91_mci.c
 ===================================================================
 --- sys/arm/at91/at91_mci.c.cvs_v1.18   2011-03-30 08:30:18.000000000 -0600
 +++ sys/arm/at91/at91_mci.c             2011-04-18 12:04:55.000000000 -0600
 @@ -67,7 +67,60 @@
  
  #include "opt_at91.h"
  
 -#define BBSZ	512
 +/* About running the MCI bus at 30mhz...
 + *
 + * Historically, the MCI bus has been run at 30mhz on systems with a 60mhz
 + * master clock, due to a bug in the mantissa table in dev/mmc.c making it
 + * appear that the card's max speed was always 30mhz.  Fixing that bug causes
 + * the mmc driver to request a 25mhz clock (as it should) and the logic in
 + * at91_mci_update_ios() picks the highest speed that doesn't exceed that limit.
 + * With a 60mhz MCK that would be 15mhz, and that's a real performance buzzkill
 + * when you've been getting away with 30mhz all along.
 + *
 + * By defining AT91_MCI_USE_30MHZ (or setting the 30mhz=1 device hint or sysctl)
 + * you can enable logic in at91_mci_update_ios() to set the mci bus to 30mhz
 + * when MCK is 60mhz and the requested speed is 25mhz.  This appears to work on
 + * virtually all SD cards, since it is what this driver has been doing by
 + * accident since day one.  I've seen modern SD cards run at 45mhz/1-bit in
 + * standard mode (high speed mode enable commands not sent) without problems.
 + *
 + * Speaking of high-speed mode, the rm9200 manual says the MCI device supports
 + * the SD v1.0 specification and can run up to 50mhz.  This is interesting in
 + * that the SD v1.0 spec caps the speed at 25mhz; high speed mode was added in
 + * the v1.10 spec.  Furthermore, high speed mode doesn't just crank up the
 + * clock, it alters the signal timing.  The rm9200 MCI device doesn't support
 + * these altered timings.  So while speeds over 25mhz may work, they only work
 + * in what the SD spec calls "default" speed mode, and it amounts to violating
 + * the spec by overclocking the bus.
 + *
 + * If you also enable 4-wire mode it's possible the 30mhz transfers will fail.
 + * If you have the USB host device and OHCI driver enabled it's g'teed to fail
 + * (I get intermittent overrun and underrun errors even at 15mhz 4-wire with
 + * OHCI). Note that you don't even need to have usb devices attached to the
 + * system, the errors begin to occur as soon as the OHCI driver sets the
 + * register bit to enable periodic transfers.  It appears (based on brief
 + * investigation) that the usb host controller uses so much ASB bandwidth that
 + * sometimes the DMA for MCI transfers doesn't get a bus grant in time and data
 + * gets dropped.  Adding even a modicum of network activity changes the symptom
 + * from intermittent to very frequent.
 + */
 +
 +#ifndef AT91_MCI_USE_30MHZ
 +#define AT91_MCI_USE_30MHZ 1
 +#endif
 +
 +/* Allocate 2 bounce buffers, each being sized to half the system default
 + * physical IO size.  That enables doing DFLTPHYS sized transfers at a time,
 + * with read transfers in particular being split into two operations so that we
 + * can overlap some of the byte-swapping needed due to the rm9200 erratum with
 + * the DMA for the second half of the transfer.
 + */
 +
 +#define BBCOUNT     2
 +#define BBSIZE      (DFLTPHYS/BBCOUNT)
 +#define MAX_BLOCKS  ((BBSIZE*BBCOUNT)/512)
 +
 +static int mci_debug;
  
  struct at91_mci_softc {
  	void *intrhand;			/* Interrupt handle */
 @@ -75,21 +128,26 @@
  	int sc_cap;
  #define	CAP_HAS_4WIRE		1	/* Has 4 wire bus */
  #define	CAP_NEEDS_BYTESWAP	2	/* broken hardware needing bounce */
 -	int flags;
  	int has_4wire;
 -#define CMD_STARTED	1
 -#define STOP_STARTED	2
 +	int use_30mhz;
 +	int flags;
 +#define PENDING_CMD	0x01
 +#define PENDING_STOP	0x02
 +#define CMD_MULTIREAD	0x10
 +#define CMD_MULTIWRITE	0x20
  	struct resource *irq_res;	/* IRQ resource */
  	struct resource	*mem_res;	/* Memory resource */
  	struct mtx sc_mtx;
  	bus_dma_tag_t dmatag;
 -	bus_dmamap_t map;
 -	int mapped;
  	struct mmc_host host;
  	int bus_busy;
  	struct mmc_request *req;
  	struct mmc_command *curcmd;
 -	char bounce_buffer[BBSZ];
 +	bus_dmamap_t bbuf_map[BBCOUNT];
 +	char      *  bbuf_vaddr[BBCOUNT]; /* bounce bufs in KVA space */
 +	uint32_t     bbuf_len[BBCOUNT];	  /* len currently queued for bounce buf */
 +	uint32_t     bbuf_curidx;	  /* which bbuf is the active DMA buffer */
 +	uint32_t     xfer_offset;	  /* offset so far into caller's buf */
  };
  
  static inline uint32_t
 @@ -123,6 +181,47 @@
  #define AT91_MCI_ASSERT_LOCKED(_sc)	mtx_assert(&_sc->sc_mtx, MA_OWNED);
  #define AT91_MCI_ASSERT_UNLOCKED(_sc) mtx_assert(&_sc->sc_mtx, MA_NOTOWNED);
  
 +/* Copy memsize bytes from sptr to dptr, byte-swapping each 32-bit word when
 + * the hardware needs it (CAP_NEEDS_BYTESWAP); otherwise do a plain copy. */
 +static void
 +at91_bswap_buf(struct at91_mci_softc *sc, void * dptr, void * sptr, uint32_t memsize)
 +{
 +	uint32_t * dst = (uint32_t *)dptr;
 +	uint32_t * src = (uint32_t *)sptr;
 +	uint32_t   i;
 +
 +	/* If the hardware doesn't need byte-swapping, let bcopy() do the work.
 +	 * Note that bcopy() takes its arguments as (source, destination, len).
 +	 */
 +	if (!(sc->sc_cap & CAP_NEEDS_BYTESWAP)) {
 +		bcopy(sptr, dptr, memsize);
 +		return;
 +	}
 +
 +	/* Nice performance boost for slightly unrolling this loop (but very
 +	 * little extra boost for further unrolling it).  Only whole 16-byte
 +	 * groups are swapped here so that a short buffer is never overrun.
 +	 */
 +	for (i = 0; i < memsize / 16; i++) {
 +		*dst++ = bswap32(*src++);
 +		*dst++ = bswap32(*src++);
 +		*dst++ = bswap32(*src++);
 +		*dst++ = bswap32(*src++);
 +	}
 +	/* Mop up the last 1-3 words, if any. */
 +	for (i = 0; i < (memsize & 0x0F); i += 4) {
 +		*dst++ = bswap32(*src++);
 +	}
 +}
 +
 +static void
 +at91_mci_getaddr(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
 +{
 +	if (error != 0)
 +		return;
 +	*(bus_addr_t *)arg = segs[0].ds_addr;
 +}
 +
  static void
  at91_mci_pdc_disable(struct at91_mci_softc *sc)
  {
 @@ -137,15 +236,57 @@
  	WR4(sc, PDC_TNCR, 0);
  }
  
 +/* Reset the controller, then restore most of the current state.
 + *
 + * This is called after detecting an error.  It's also called after stopping a
 + * multi-block write, to un-wedge the device so that it will handle the NOTBUSY
 + * signal correctly.  See comments in at91_mci_stop_done() for more details.
 + */
 +static void at91_mci_reset(struct at91_mci_softc *sc)
 +{
 +	uint32_t mr;
 +	uint32_t sdcr;
 +	uint32_t dtor;
 +	uint32_t imr;
 +
 +	at91_mci_pdc_disable(sc);
 +
 +	/* save current state */
 +
 +	imr  = RD4(sc, MCI_IMR);
 +	mr   = RD4(sc, MCI_MR) & 0x7fff;
 +	sdcr = RD4(sc, MCI_SDCR);
 +	dtor = RD4(sc, MCI_DTOR);
 +
 +	/* reset the controller */
 +
 +	WR4(sc, MCI_IDR, 0xffffffff);
 +	WR4(sc, MCI_CR, MCI_CR_MCIDIS | MCI_CR_SWRST);
 +
 +	/* restore state */
 +
 +	WR4(sc, MCI_CR, MCI_CR_MCIEN|MCI_CR_PWSEN);
 +	WR4(sc, MCI_MR, mr);
 +	WR4(sc, MCI_SDCR, sdcr);
 +	WR4(sc, MCI_DTOR, dtor);
 +	WR4(sc, MCI_IER, imr);
 +
 +	/* Make sure sdio interrupts will fire.  Not sure why reading
 +	 * SR ensures that, but this is in the linux driver.
 +	 */
 +
 +	RD4(sc, MCI_SR);
 +}
 +
  static void
  at91_mci_init(device_t dev)
  {
  	struct at91_mci_softc *sc = device_get_softc(dev);
  
 -	WR4(sc, MCI_CR, MCI_CR_MCIEN);		/* Enable controller */
 +	WR4(sc, MCI_CR, MCI_CR_MCIDIS | MCI_CR_SWRST); /* Put the device into reset */
  	WR4(sc, MCI_IDR, 0xffffffff);		/* Turn off interrupts */
  	WR4(sc, MCI_DTOR, MCI_DTOR_DTOMUL_1M | 1);
 -	WR4(sc, MCI_MR, 0x834a);	// XXX GROSS HACK FROM LINUX
 +	WR4(sc, MCI_MR, 0x834a); // set PDCMODE, PWSDIV=3, CLKDIV=75
  #ifndef  AT91_MCI_SLOT_B
  	WR4(sc, MCI_SDCR, 0);			/* SLOT A, 1 bit bus */
  #else
 @@ -153,6 +294,11 @@
  	 * a two slot card that we know of. XXX */
  	WR4(sc, MCI_SDCR, 1);			/* SLOT B, 1 bit bus */
  #endif
 +	/* Enable controller, including power-save.  The slower clock of
 +	 * the power-save mode is only in effect when there is no transfer in
 +	 * progress, so it can be left in this mode all the time.
 +	 */
 +	WR4(sc, MCI_CR, MCI_CR_MCIEN|MCI_CR_PWSEN);
  }
  
  static void
 @@ -167,8 +313,8 @@
  
  static int
  at91_mci_probe(device_t dev)
 -{
  
 +{
  	device_set_desc(dev, "MCI mmc/sd host bridge");
  	return (0);
  }
 @@ -180,6 +326,7 @@
  	struct sysctl_ctx_list *sctx;
  	struct sysctl_oid *soid;
  	device_t child;
 +	int i;
  	int err;
  
  	sc->dev = dev;
 @@ -193,48 +340,100 @@
  
  	AT91_MCI_LOCK_INIT(sc);
  
 -	/*
 -	 * Allocate DMA tags and maps
 +	at91_mci_fini(dev);
 +	at91_mci_init(dev);
 +
 +	/* Allocate DMA tags and maps and bounce buffers.
 +	 *
 +	 * The parms in the tag_create call cause the dmamem_alloc call to
 +	 * create each bounce buffer as a single contiguous buffer of BBSIZE
 +	 * bytes aligned to a 4096 byte boundary.
 +	 *
 +	 * Do not use DMA_COHERENT for these buffers because that maps the
 +	 * memory as non-cachable, which prevents cache line burst fills/writes,
 +	 * which is something we need since we're trying to overlap the
 +	 * byte-swapping with the DMA operations.
  	 */
 -	err = bus_dma_tag_create(bus_get_dma_tag(dev), 1, 0,
 -	    BUS_SPACE_MAXADDR_32BIT, BUS_SPACE_MAXADDR, NULL, NULL, MAXPHYS, 1,
 -	    MAXPHYS, BUS_DMA_ALLOCNOW, NULL, NULL, &sc->dmatag);
 -	if (err != 0)
 -		goto out;
  
 -	err = bus_dmamap_create(sc->dmatag, 0,  &sc->map);
 +	err = bus_dma_tag_create(bus_get_dma_tag(dev), 4096, 0,
 +	    BUS_SPACE_MAXADDR_32BIT, BUS_SPACE_MAXADDR, NULL, NULL, 
 +	    BBSIZE, 1, MAXPHYS, 0, NULL, NULL, &sc->dmatag);
  	if (err != 0)
  		goto out;
  
 -	at91_mci_fini(dev);
 -	at91_mci_init(dev);
 +	for (i = 0; i < BBCOUNT; ++i) {
 +		err = bus_dmamem_alloc(sc->dmatag, (void **)&sc->bbuf_vaddr[i],
 +		    BUS_DMA_NOWAIT, &sc->bbuf_map[i]);
 +		if (err != 0)
 +			goto out;
 +	}
  
  	/*
  	 * Activate the interrupt
  	 */
 -	err = bus_setup_intr(dev, sc->irq_res, INTR_TYPE_MISC | INTR_MPSAFE,
 +	err = bus_setup_intr(dev, sc->irq_res, INTR_TYPE_BIO | INTR_MPSAFE,
  	    NULL, at91_mci_intr, sc, &sc->intrhand);
  	if (err) {
  		AT91_MCI_LOCK_DESTROY(sc);
  		goto out;
  	}
  
 +	/* Allow 4-wire to be initially set via #define.
 +	 * Allow a device hint to override that.
 +	 * Allow a sysctl to override that.
 +	 */
 +
 +#if defined(AT91_MCI_HAS_4WIRE) && AT91_MCI_HAS_4WIRE != 0
 +	sc->has_4wire = 1;
 +#else
 +	sc->has_4wire = 0;
 +#endif
 +
 +	resource_int_value(device_get_name(dev), device_get_unit(dev), 
 +			   "4wire", &sc->has_4wire);
 +
  	sctx = device_get_sysctl_ctx(dev);
  	soid = device_get_sysctl_tree(dev);
  	SYSCTL_ADD_UINT(sctx, SYSCTL_CHILDREN(soid), OID_AUTO, "4wire",
  	    CTLFLAG_RW, &sc->has_4wire, 0, "has 4 wire SD Card bus");
  
 -#ifdef AT91_MCI_HAS_4WIRE
 -	sc->has_4wire = 1;
 -#endif
  	if (sc->has_4wire)
  		sc->sc_cap |= CAP_HAS_4WIRE;
  
 -	sc->host.f_min = at91_master_clock / 512;
 +	/* Allow use_30mhz to be initially set via #define.
 +	 * Allow a device hint to override that.
 +	 * Allow a sysctl to override that.
 +	 */
 +
 +#if defined(AT91_MCI_USE_30MHZ) && AT91_MCI_USE_30MHZ != 0
 +	sc->use_30mhz = 1;
 +#else
 +	sc->use_30mhz = 0;
 +#endif
 +
 +	resource_int_value(device_get_name(dev), device_get_unit(dev), 
 +			   "30mhz", &sc->use_30mhz);
 +
 +	sctx = device_get_sysctl_ctx(dev);
 +	soid = device_get_sysctl_tree(dev);
 +	SYSCTL_ADD_UINT(sctx, SYSCTL_CHILDREN(soid), OID_AUTO, "30mhz",
 +	    CTLFLAG_RW, &sc->use_30mhz, 0, "use 30mhz clock for 25mhz request");
 +
 +	/* Our real min freq is master_clock/512, but upper driver layers are
 +	 * going to set the min speed during card discovery, and the right speed
 +	 * for that is 400khz, so advertise a safe value just under that.
 +	 *
 +	 * For max speed, while the rm9200 manual says the max is 50mhz, it also
 +	 * says it supports only the SD v1.0 spec, which means the real limit is
 +	 * 25mhz. On the other hand, historical use has been to slightly violate
 +	 * the standard by running the bus at 30mhz.  For more information on
 +	 * that, see the comments at the top of this file.
 +	 */
 +
  	sc->host.f_min = 375000;
  	sc->host.f_max = at91_master_clock / 2;
 -	if (sc->host.f_max > 50000000)	
 -		sc->host.f_max = 50000000;	/* Limit to 50MHz */
 +	if (sc->host.f_max > 25000000)
 +		sc->host.f_max = 25000000;
  
  	sc->host.host_ocr = MMC_OCR_320_330 | MMC_OCR_330_340;
  	sc->host.caps = 0;
 @@ -252,8 +451,15 @@
  static int
  at91_mci_detach(device_t dev)
  {
 +	struct at91_mci_softc *sc = device_get_softc(dev);
 +
  	at91_mci_fini(dev);
  	at91_mci_deactivate(dev);
 +
 +	bus_dmamem_free(sc->dmatag, sc->bbuf_vaddr[0], sc->bbuf_map[0]);
 +	bus_dmamem_free(sc->dmatag, sc->bbuf_vaddr[1], sc->bbuf_map[1]);
 +	bus_dma_tag_destroy(sc->dmatag);
 +
  	return (EBUSY);	/* XXX */
  }
  
 @@ -293,7 +499,7 @@
  	sc->intrhand = 0;
  	bus_generic_detach(sc->dev);
  	if (sc->mem_res)
 -		bus_release_resource(dev, SYS_RES_IOPORT,
 +		bus_release_resource(dev, SYS_RES_MEMORY,
  		    rman_get_rid(sc->mem_res), sc->mem_res);
  	sc->mem_res = 0;
  	if (sc->irq_res)
 @@ -303,14 +509,6 @@
  	return;
  }
  
 -static void
 -at91_mci_getaddr(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
 -{
 -	if (error != 0)
 -		return;
 -	*(bus_addr_t *)arg = segs[0].ds_addr;
 -}
 -
  static int
  at91_mci_update_ios(device_t brdev, device_t reqdev)
  {
 @@ -322,16 +520,31 @@
  	sc = device_get_softc(brdev);
  	host = &sc->host;
  	ios = &host->ios;
 -	// bus mode?
 +
 +	/* Calculate our closest available clock speed that doesn't exceed the
 +	 * requested speed.
 +	 *
 +	 * If the master clock is running at 60mhz and the requested bus speed
 +	 * is 25mhz and the use_30mhz flag is on, set clkdiv to zero to get a
 +	 * 30mhz mci clock. See comments near the top of the file for more info.
 +	 *
 +	 * Whatever we come up with, store it back into ios->clock so that the
 +	 * upper layer drivers can report the actual speed of the bus.
 +	 */
 +
  	if (ios->clock == 0) {
  		WR4(sc, MCI_CR, MCI_CR_MCIDIS);
  		clkdiv = 0;
  	} else {
 -		WR4(sc, MCI_CR, MCI_CR_MCIEN);
 -		if ((at91_master_clock % (ios->clock * 2)) == 0)
 +		WR4(sc, MCI_CR, MCI_CR_MCIEN|MCI_CR_PWSEN);
 +		if (sc->use_30mhz && at91_master_clock == 60000000 && 
 +		    ios->clock == 25000000)
 +			clkdiv = 0;
 +                else if ((at91_master_clock % (ios->clock * 2)) == 0)
  			clkdiv = ((at91_master_clock / ios->clock) / 2) - 1;
  		else
  			clkdiv = (at91_master_clock / ios->clock) / 2;
 +		ios->clock = at91_master_clock / ((clkdiv+1) * 2);
  	}
  	if (ios->bus_width == bus_width_4)
  		WR4(sc, MCI_SDCR, RD4(sc, MCI_SDCR) | MCI_SDCR_SDCBUS);
 @@ -346,137 +559,247 @@
  static void
  at91_mci_start_cmd(struct at91_mci_softc *sc, struct mmc_command *cmd)
  {
 -	uint32_t cmdr, ier = 0, mr;
 -	uint32_t *src, *dst;
 -	int i;
 +	uint32_t cmdr, mr;
  	struct mmc_data *data;
 -	void *vaddr;
 -	bus_addr_t paddr;
  
  	sc->curcmd = cmd;
  	data = cmd->data;
 -	cmdr = cmd->opcode;
  
  	/* XXX Upper layers don't always set this */
  	cmd->mrq = sc->req;
  
 +	/* Begin setting up command register. */
 +
 +	cmdr = cmd->opcode;
 +
 +	if (sc->host.ios.bus_mode == opendrain)
 +		cmdr |= MCI_CMDR_OPDCMD;
 +
 +	/* Set up response handling.  Allow max timeout for responses. */
 +
  	if (MMC_RSP(cmd->flags) == MMC_RSP_NONE)
  		cmdr |= MCI_CMDR_RSPTYP_NO;
  	else {
 -		/* Allow big timeout for responses */
  		cmdr |= MCI_CMDR_MAXLAT;
  		if (cmd->flags & MMC_RSP_136)
  			cmdr |= MCI_CMDR_RSPTYP_136;
  		else
  			cmdr |= MCI_CMDR_RSPTYP_48;
  	}
 -	if (cmd->opcode == MMC_STOP_TRANSMISSION)
 -		cmdr |= MCI_CMDR_TRCMD_STOP;
 -	if (sc->host.ios.bus_mode == opendrain)
 -		cmdr |= MCI_CMDR_OPDCMD;
 -	if (!data) {
 -		// The no data case is fairly simple
 +
 +	/* If there is no data transfer, just set up the right interrupt mask
 +	 * and start the command.
 +	 *
 +	 * The interrupt mask needs to be CMDRDY plus all non-data-transfer
 +	 * errors. It's important to leave the transfer-related errors out, to
 +	 * avoid spurious timeout or crc errors on a STOP command following a
 +	 * multiblock read.  When a multiblock read is in progress, sending a
 +	 * STOP in the middle of a block occasionally triggers such errors, but
 +	 * we're totally disinterested in them because we've already gotten all
 +	 * the data we wanted without error before sending the STOP command.
 +	 */
 +
 +	if (data == NULL) {
 +		uint32_t ier = MCI_SR_CMDRDY | 
 +                    MCI_SR_RTOE | MCI_SR_RENDE | 
 +		    MCI_SR_RCRCE | MCI_SR_RDIRE | MCI_SR_RINDE;
 +
  		at91_mci_pdc_disable(sc);
 -//		printf("CMDR %x ARGR %x\n", cmdr, cmd->arg);
 +
 +		if (cmd->opcode == MMC_STOP_TRANSMISSION)
 +			cmdr |= MCI_CMDR_TRCMD_STOP;
 +
 +		/* Ignore response CRC on CMD1 and ACMD41, per standard. */
 +
 +		if (cmd->opcode == MMC_SEND_OP_COND ||
 +		    cmd->opcode == ACMD_SD_SEND_OP_COND)
 +			ier &= ~MCI_SR_RCRCE;
 +
 +		if (mci_debug)
 +			printf("CMDR %x (opcode %d) ARGR %x no data\n", 
 +			    cmdr, cmd->opcode, cmd->arg);
 +
  		WR4(sc, MCI_ARGR, cmd->arg);
  		WR4(sc, MCI_CMDR, cmdr);
 -		WR4(sc, MCI_IER, MCI_SR_ERROR | MCI_SR_CMDRDY);
 +		WR4(sc, MCI_IDR, 0xffffffff);
 +		WR4(sc, MCI_IER, ier);
  		return;
  	}
 +
 +	/* There is data, set up the transfer-related parts of the command. */
 +
  	if (data->flags & MMC_DATA_READ)
  		cmdr |= MCI_CMDR_TRDIR;
 +
  	if (data->flags & (MMC_DATA_READ | MMC_DATA_WRITE))
  		cmdr |= MCI_CMDR_TRCMD_START;
 +
  	if (data->flags & MMC_DATA_STREAM)
  		cmdr |= MCI_CMDR_TRTYP_STREAM;
 -	if (data->flags & MMC_DATA_MULTI)
 +	else if (data->flags & MMC_DATA_MULTI) {
  		cmdr |= MCI_CMDR_TRTYP_MULTIPLE;
 -	// Set block size and turn on PDC mode for dma xfer and disable
 -	// PDC until we're ready.
 -	mr = RD4(sc, MCI_MR) & ~MCI_MR_BLKLEN;
 -	WR4(sc, MCI_MR, mr | (data->len << 16) | MCI_MR_PDCMODE);
 -	WR4(sc, PDC_PTCR, PDC_PTCR_RXTDIS | PDC_PTCR_TXTDIS);
 -	if (cmdr & MCI_CMDR_TRCMD_START) {
 -		if (cmdr & MCI_CMDR_TRDIR)
 -			vaddr = cmd->data->data;
 -		else {
 -			/* Use bounce buffer even if we don't need
 -			 * byteswap, since buffer may straddle a page
 -			 * boundry, and we don't handle multi-segment
 -			 * transfers in hardware.
 -			 * (page issues seen from 'bsdlabel -w' which
 -			 * uses raw geom access to the volume).
 -			 * Greg Ansley (gja (at) ansley.com)
 -			 */
 -			vaddr = sc->bounce_buffer;
 -			src = (uint32_t *)cmd->data->data;
 -			dst = (uint32_t *)vaddr;
 -			if (sc->sc_cap & CAP_NEEDS_BYTESWAP) {
 -				for (i = 0; i < data->len / 4; i++)
 -					dst[i] = bswap32(src[i]);
 -			} else
 -				memcpy(dst, src, data->len);
 -		}
 -		data->xfer_len = 0;
 -		if (bus_dmamap_load(sc->dmatag, sc->map, vaddr, data->len,
 -		    at91_mci_getaddr, &paddr, 0) != 0) {
 -			cmd->error = MMC_ERR_NO_MEMORY;
 -			sc->req = NULL;
 -			sc->curcmd = NULL;
 -			cmd->mrq->done(cmd->mrq);
 -			return;
 -		}
 -		sc->mapped++;
 -		if (cmdr & MCI_CMDR_TRDIR) {
 -			bus_dmamap_sync(sc->dmatag, sc->map, BUS_DMASYNC_PREREAD);
 -			WR4(sc, PDC_RPR, paddr);
 -			WR4(sc, PDC_RCR, data->len / 4);
 -			ier = MCI_SR_ENDRX;
 -		} else {
 -			bus_dmamap_sync(sc->dmatag, sc->map, BUS_DMASYNC_PREWRITE);
 -			WR4(sc, PDC_TPR, paddr);
 -			WR4(sc, PDC_TCR, data->len / 4);
 -			ier = MCI_SR_TXBUFE;
 -		}
 +		sc->flags |= (data->flags & MMC_DATA_READ) ? 
 +				CMD_MULTIREAD : CMD_MULTIWRITE;
  	}
 -//	printf("CMDR %x ARGR %x with data\n", cmdr, cmd->arg);
 -	WR4(sc, MCI_ARGR, cmd->arg);
 -	if (cmdr & MCI_CMDR_TRCMD_START) {
 -		if (cmdr & MCI_CMDR_TRDIR) {
 +
 +	/* Disable PDC until we're ready.
 +	 *
 +	 * Set block size and turn on PDC mode for dma xfer.
 +	 * Note that the block size is the smaller of the amount of data to be
 +	 * transferred, or 512 bytes.  The 512 size is fixed by the standard;
 +	 * smaller blocks are possible, but never larger.
 +	 */
 +
 +	WR4(sc, PDC_PTCR, PDC_PTCR_RXTDIS | PDC_PTCR_TXTDIS); 
 +
 +	mr = RD4(sc,MCI_MR) & ~MCI_MR_BLKLEN; 
 +	mr |=  min(data->len, 512) << 16; 
 +	WR4(sc, MCI_MR, mr | MCI_MR_PDCMODE|MCI_MR_PDCPADV);
 +
 +	/* Set up DMA.
 +	 *
 +	 * Use bounce buffers even if we don't need to byteswap, because doing
 +	 * multi-block IO with large DMA buffers is way fast (compared to
 +	 * single-block IO), even after incurring the overhead of also copying
 +	 * from/to the caller's buffers (which may be in non-contiguous physical
 +	 * pages).
 +	 *
 +	 * In an ideal non-byteswap world we could create a dma tag that allows
 +	 * for discontiguous segments and do the IO directly from/to the
 +	 * caller's buffer(s), using ENDRX/ENDTX interrupts to chain the
 +	 * discontiguous buffers through the PDC. Someday.
 +	 *
 +         * If a read is bigger than 2k, split it in half so that we can start
 +	 * byte-swapping the first half while the second half is on the wire.
 +	 * It would be best if we could split it into 8k chunks, but we can't
 +	 * always keep up with the byte-swapping due to other system activity,
 +	 * and if an RXBUFF interrupt happens while we're still handling the
 +	 * byte-swap from the prior buffer (IE, we haven't returned from
 +	 * handling the prior interrupt yet), then data will get dropped on the
 +	 * floor and we can't easily recover from that.  The right fix for that
 +	 * would be to have the interrupt handling only keep the DMA flowing and
 +	 * enqueue filled buffers to be byte-swapped in a non-interrupt context.
 +	 * Even that won't work on the write side of things though; in that
 +	 * context we have to have all the data ready to go before starting the
 +	 * dma.
 +	 *
 +	 * XXX what about stream transfers?
 +	 */
 +
 +	sc->xfer_offset = 0;
 +	sc->bbuf_curidx = 0;
 +
 +	if (data->flags & (MMC_DATA_READ | MMC_DATA_WRITE)) {
 +		uint32_t len;
 +		uint32_t remaining = data->len;
 +		bus_addr_t paddr;
 +		int err;
 +
 +		if (remaining > (BBCOUNT*BBSIZE))
 +			panic("IO read size exceeds MAXDATA\n");
 +
 +		if (data->flags & MMC_DATA_READ) {
 +			if (remaining > 2048)
 +				len = remaining / 2;
 +			else
 +				len = remaining;
 +			err = bus_dmamap_load(sc->dmatag, sc->bbuf_map[0], 
 +			    sc->bbuf_vaddr[0], len, at91_mci_getaddr, 
 +			    &paddr, BUS_DMA_NOWAIT);
 +			if (err != 0)
 +				panic("IO read dmamap_load failed\n");
 +			bus_dmamap_sync(sc->dmatag, sc->bbuf_map[0], 
 +			    BUS_DMASYNC_PREREAD);
 +			WR4(sc, PDC_RPR, paddr);
 +			WR4(sc, PDC_RCR, len / 4);
 +			sc->bbuf_len[0] = len;
 +			remaining -= len;
 +			if (remaining == 0) {
 +				sc->bbuf_len[1] = 0;
 +			} else {
 +				len = remaining;
 +				err = bus_dmamap_load(sc->dmatag, sc->bbuf_map[1], 
 +				    sc->bbuf_vaddr[1], len, at91_mci_getaddr, 
 +				    &paddr, BUS_DMA_NOWAIT);
 +				if (err != 0)
 +					panic("IO read dmamap_load failed\n");
 +				bus_dmamap_sync(sc->dmatag, sc->bbuf_map[1], 
 +				    BUS_DMASYNC_PREREAD);
 +				WR4(sc, PDC_RNPR, paddr);
 +				WR4(sc, PDC_RNCR, len / 4);
 +				sc->bbuf_len[1] = len;
 +				remaining -= len;
 +			}
  			WR4(sc, PDC_PTCR, PDC_PTCR_RXTEN);
 -			WR4(sc, MCI_CMDR, cmdr);
  		} else {
 -			WR4(sc, MCI_CMDR, cmdr);
 -			WR4(sc, PDC_PTCR, PDC_PTCR_TXTEN);
 +			len = min(BBSIZE, remaining);
 +			at91_bswap_buf(sc, sc->bbuf_vaddr[0], data->data, len);
 +			err = bus_dmamap_load(sc->dmatag, sc->bbuf_map[0], 
 +			    sc->bbuf_vaddr[0], len, at91_mci_getaddr, 
 +			    &paddr, BUS_DMA_NOWAIT);
 +			if (err != 0)
 +				panic("IO write dmamap_load failed\n");
 +			bus_dmamap_sync(sc->dmatag, sc->bbuf_map[0], 
 +			    BUS_DMASYNC_PREWRITE);
 +			WR4(sc, PDC_TPR,paddr);
 +			WR4(sc, PDC_TCR, len / 4);
 +			sc->bbuf_len[0] = len;
 +			remaining -= len;
 +			if (remaining == 0) {
 +				sc->bbuf_len[1] = 0;
 +			} else {
 +				len = remaining;
 +				at91_bswap_buf(sc, sc->bbuf_vaddr[1],
 +				    ((char *)data->data)+BBSIZE, len);
 +				err = bus_dmamap_load(sc->dmatag, sc->bbuf_map[1], 
 +				    sc->bbuf_vaddr[1], len, at91_mci_getaddr, 
 +				    &paddr, BUS_DMA_NOWAIT);
 +				if (err != 0)
 +					panic("IO write dmamap_load failed\n");
 +				bus_dmamap_sync(sc->dmatag, sc->bbuf_map[1], 
 +				    BUS_DMASYNC_PREWRITE);
 +				WR4(sc, PDC_TNPR, paddr);
 +				WR4(sc, PDC_TNCR, len / 4);
 +				sc->bbuf_len[1] = len;
 +				remaining -= len;
 +			}
 +			/* do not enable PDC xfer until CMDRDY asserted */
  		}
 +		data->xfer_len = 0; /* XXX what's this? appears to be unused. */
  	}
 -	WR4(sc, MCI_IER, MCI_SR_ERROR | ier);
 +
 +	if (mci_debug)
 +		printf("CMDR %x (opcode %d) ARGR %x with data len %d\n", 
 +		       cmdr, cmd->opcode, cmd->arg, cmd->data->len);
 +
 +	WR4(sc, MCI_ARGR, cmd->arg);
 +	WR4(sc, MCI_CMDR, cmdr);
 +	WR4(sc, MCI_IER, MCI_SR_ERROR | MCI_SR_CMDRDY);
  }
  
  static void
 -at91_mci_start(struct at91_mci_softc *sc)
 +at91_mci_next_operation(struct at91_mci_softc *sc)
  {
  	struct mmc_request *req;
  
  	req = sc->req;
  	if (req == NULL)
  		return;
 -	// assert locked
 -	if (!(sc->flags & CMD_STARTED)) {
 -		sc->flags |= CMD_STARTED;
 -//		printf("Starting CMD\n");
 +
 +	if (sc->flags & PENDING_CMD) {
 +		sc->flags &= ~PENDING_CMD;
  		at91_mci_start_cmd(sc, req->cmd);
  		return;
 -	}
 -	if (!(sc->flags & STOP_STARTED) && req->stop) {
 -//		printf("Starting Stop\n");
 -		sc->flags |= STOP_STARTED;
 +	} else if (sc->flags & PENDING_STOP) {
 +		sc->flags &= ~PENDING_STOP;
  		at91_mci_start_cmd(sc, req->stop);
  		return;
  	}
 -	/* We must be done -- bad idea to do this while locked? */
 +
 +	WR4(sc, MCI_IDR, 0xffffffff);
  	sc->req = NULL;
  	sc->curcmd = NULL;
 +	//printf("req done\n");
  	req->done(req);
  }
  
 @@ -486,16 +809,16 @@
  	struct at91_mci_softc *sc = device_get_softc(brdev);
  
  	AT91_MCI_LOCK(sc);
 -	// XXX do we want to be able to queue up multiple commands?
 -	// XXX sounds like a good idea, but all protocols are sync, so
 -	// XXX maybe the idea is naive...
  	if (sc->req != NULL) {
  		AT91_MCI_UNLOCK(sc);
  		return (EBUSY);
  	}
 +	//printf("new req\n");
  	sc->req = req;
 -	sc->flags = 0;
 -	at91_mci_start(sc);
 +	sc->flags = PENDING_CMD;
 +	if (sc->req->stop)
 +		sc->flags |= PENDING_STOP;
 +	at91_mci_next_operation(sc);
  	AT91_MCI_UNLOCK(sc);
  	return (0);
  }
 @@ -533,120 +856,341 @@
  }
  
  static void
 -at91_mci_read_done(struct at91_mci_softc *sc)
 +at91_mci_read_done(struct at91_mci_softc *sc, uint32_t sr)
  {
 -	uint32_t *walker;
 -	struct mmc_command *cmd;
 -	int i, len;
 -
 -	cmd = sc->curcmd;
 -	bus_dmamap_sync(sc->dmatag, sc->map, BUS_DMASYNC_POSTREAD);
 -	bus_dmamap_unload(sc->dmatag, sc->map);
 -	sc->mapped--;
 -	if (sc->sc_cap & CAP_NEEDS_BYTESWAP) {
 -		walker = (uint32_t *)cmd->data->data;
 -		len = cmd->data->len / 4;
 -		for (i = 0; i < len; i++)
 -			walker[i] = bswap32(walker[i]);
 -	}
 -	// Finish up the sequence...
 -	WR4(sc, MCI_IDR, MCI_SR_ENDRX);
 -	WR4(sc, MCI_IER, MCI_SR_RXBUFF);
 -	WR4(sc, PDC_PTCR, PDC_PTCR_RXTDIS | PDC_PTCR_TXTDIS);
 +	struct mmc_command *cmd = sc->curcmd;
 +	char * dataptr = (char *)cmd->data->data;
 +	uint32_t curidx = sc->bbuf_curidx;
 +	uint32_t len = sc->bbuf_len[curidx];
 +
 +	/* We arrive here when a DMA transfer for a read is done, whether it's a
 +	 * single or multi-block read.
 +	 *
 +	 * We byte-swap the buffer that just completed, and if that is the last
 +	 * buffer that's part of this read then we move on to the next
 +	 * operation, otherwise we wait for another ENDRX for the next buffer.
 +	 */
 +
 +	bus_dmamap_sync(sc->dmatag, sc->bbuf_map[curidx], BUS_DMASYNC_POSTREAD);
 +	bus_dmamap_unload(sc->dmatag, sc->bbuf_map[curidx]);
 +
 +	at91_bswap_buf(sc, dataptr + sc->xfer_offset, sc->bbuf_vaddr[curidx], len);
 +
 +	if (mci_debug) {
 +		printf("read done sr %x curidx %d len %d xfer_offset %d\n",
 +		       sr, curidx, len, sc->xfer_offset);
 +	}
 +
 +	sc->xfer_offset += len;
 +	sc->bbuf_curidx = !curidx; /* swap buffers */
 +
 +	/* If we've transferred all the data, move on to the next operation.
 +	 *
 +	 * If we're still transferring the last buffer, RNCR is already zero but
 +	 * we have to write a zero anyway to clear the ENDRX status so we don't
 +	 * re-interrupt until the last buffer is done.
 +	 */
 +
 +	if (sc->xfer_offset == cmd->data->len) {
 +		WR4(sc, PDC_PTCR, PDC_PTCR_RXTDIS | PDC_PTCR_TXTDIS);
 +		cmd->error = MMC_ERR_NONE;
 +		at91_mci_next_operation(sc);
 +	} else {
 +		WR4(sc, PDC_RNCR, 0);
 +		WR4(sc, MCI_IER, MCI_SR_ERROR | MCI_SR_ENDRX);
 +	}
  }
  
  static void
 -at91_mci_xmit_done(struct at91_mci_softc *sc)
 +at91_mci_write_done(struct at91_mci_softc *sc, uint32_t sr)
  {
 -	// Finish up the sequence...
 +	struct mmc_command *cmd = sc->curcmd;
 +
 +	/* We arrive here when the entire DMA transfer for a write is done,
 +	 * whether it's a single or multi-block write.  If it's multi-block we
 +	 * have to immediately move on to the next operation which is to send
 +	 * the stop command.  If it's a single-block transfer we need to wait
 +	 * for NOTBUSY, but if that's already asserted we can avoid another
 +	 * interrupt and just move on to completing the request right away.
 +	 */
 +
  	WR4(sc, PDC_PTCR, PDC_PTCR_RXTDIS | PDC_PTCR_TXTDIS);
 -	WR4(sc, MCI_IDR, MCI_SR_TXBUFE);
 -	WR4(sc, MCI_IER, MCI_SR_NOTBUSY);
 -	bus_dmamap_sync(sc->dmatag, sc->map, BUS_DMASYNC_POSTWRITE);
 -	bus_dmamap_unload(sc->dmatag, sc->map);
 -	sc->mapped--;
 +
 +	bus_dmamap_sync(sc->dmatag, sc->bbuf_map[sc->bbuf_curidx], BUS_DMASYNC_POSTWRITE);
 +	bus_dmamap_unload(sc->dmatag, sc->bbuf_map[sc->bbuf_curidx]);
 +
 +	if ((cmd->data->flags & MMC_DATA_MULTI) || (sr & MCI_SR_NOTBUSY)) {
 +                cmd->error = MMC_ERR_NONE;
 +                at91_mci_next_operation(sc);
 +	} else {
 +		WR4(sc, MCI_IER, MCI_SR_ERROR | MCI_SR_NOTBUSY);
 +	}
 +}
 +
 +static void
 +at91_mci_notbusy(struct at91_mci_softc *sc)
 +{
 +	struct mmc_command *cmd = sc->curcmd;
 +
 +	/* We arrive here by either completion of a single-block write, or
 +	 * completion of the stop command that ended a multi-block write (and, I
 +	 * suppose, after a card-select or erase, but I haven't tested those).
 +	 * Anyway, we're done and it's time to move on to the next command.
 +	 */
 +
 +	cmd->error = MMC_ERR_NONE;
 +	at91_mci_next_operation(sc);
 +}
 +
 +static void
 +at91_mci_stop_done(struct at91_mci_softc *sc, uint32_t sr)
 +{
 +	struct mmc_command *cmd = sc->curcmd;
 +
 +	/* We arrive here after receiving CMDRDY for a MMC_STOP_TRANSMISSION
 +	 * command.  Depending on the operation being stopped, we may have to do
 +	 * some unusual things to work around hardware bugs.
 +	 */
 +
 +	/* This is known to be true of at91rm9200 hardware; it may or may not
 +	 * apply to more recent chips: 
 +	 *
 +	 * After stopping a multi-block write, the NOTBUSY bit in MCI_SR does
 +	 * not properly reflect the actual busy state of the card as signaled on
 +	 * the DAT0 line; it always claims the card is not-busy.  If we believe
 +	 * that and let operations continue, following commands will fail with
 +	 * response timeouts (except of course MMC_SEND_STATUS -- it indicates
 +	 * the card is busy in the PRG state, which was the smoking gun that
 +	 * showed MCI_SR NOTBUSY was not tracking DAT0 correctly).
 +	 *
 +	 * The atmel docs are emphatic: "This flag [NOTBUSY] must be used only
 +	 * for Write Operations."  I guess technically since we sent a stop it's
 +	 * not a write operation anymore.  But then just what did they think it
 +	 * meant for the stop command to have "...an optional busy signal
 +	 * transmitted on the data line" according to the SD spec?
 +	 *
 +	 * I tried a variety of things to un-wedge the MCI and get the status
 +	 * register to reflect NOTBUSY correctly again, but the only thing that
 +	 * worked was a full device reset.  It feels like an awfully big hammer,
 +	 * but doing a full reset after every multiblock write is still faster
 +	 * than doing single-block IO (by almost two orders of magnitude:
 +	 * 20KB/sec improves to about 1.8MB/sec best case).
 +	 *
 +	 * After doing the reset, wait for a NOTBUSY interrupt before continuing
 +	 * with the next operation.
 +	 */
 +
 +	if (sc->flags & CMD_MULTIWRITE) {
 +		at91_mci_reset(sc);
 +		WR4(sc, MCI_IER, MCI_SR_ERROR | MCI_SR_NOTBUSY);
 +		return;
 +	}
 +
 +	/* This is known to be true of at91rm9200 hardware; it may or may not
 +	 * apply to more recent chips: 
 +	 *
 +	 * After stopping a multi-block read, loop to read and discard any data
 +	 * that coasts in after we sent the stop command.  The docs don't say
 +	 * anything about it, but empirical testing shows that 1-3 additional
 +	 * words of data get buffered up in some unmentioned internal fifo and
 +	 * if we don't read and discard them here they end up on the front of
 +	 * the next read DMA transfer we do.
 +	 */
 +
 +	if (sc->flags & CMD_MULTIREAD) {
 +		uint32_t sr;
 +		int count = 0;
 +		do {
 +			sr = RD4(sc, MCI_SR);
 +			if (sr & MCI_SR_RXRDY) {
 +				RD4(sc,  MCI_RDR);
 +				++count;
 +			}
 +		} while (sr & MCI_SR_RXRDY);
 +		at91_mci_reset(sc);
 +//              if (count != 0)
 +//                      printf("Had to soak up %d words after read\n", count);
 +	}
 +
 +	cmd->error = MMC_ERR_NONE;
 +	at91_mci_next_operation(sc);
 +
 +}
 +
 +static void
 +at91_mci_cmdrdy(struct at91_mci_softc *sc, uint32_t sr)
 +{
 +	struct mmc_command *cmd = sc->curcmd;
 +	int i;
 +
 +	if (cmd == NULL)
 +		return;
 +
 +	/* We get here at the end of EVERY command.  We retrieve the command
 +	 * response (if any) then decide what to do next based on the command.
 +	 */
 +
 +	if (cmd->flags & MMC_RSP_PRESENT) {
 +		for (i = 0; i < ((cmd->flags & MMC_RSP_136) ? 4 : 1); i++) {
 +			cmd->resp[i] = RD4(sc, MCI_RSPR + i * 4);
 +			if (mci_debug)
 +				printf("RSPR[%d] = %x sr=%x\n", i, cmd->resp[i],  sr);
 +		}
 +	}
 +
 +	/* If this was a stop command, go handle the various special
 +	 * conditions (read: bugs) that have to be dealt with following a stop.
 +	 */
 +
 +	if (cmd->opcode == MMC_STOP_TRANSMISSION) {
 +		at91_mci_stop_done(sc, sr);
 +		return;
 +	}
 +
 +	/* If this command can continue to assert BUSY beyond the response then
 +	 * we need to wait for NOTBUSY before the command is really done.
 +	 *
 +	 * Note that this may not work properly on the at91rm9200.  It certainly
 +	 * doesn't work for the STOP command that follows a multi-block write,
 +	 * so post-stop CMDRDY is handled separately; see the special handling
 +	 * in at91_mci_stop_done().
 +	 *
 +	 * Besides STOP, there are other R1B-type commands that use the busy
 +	 * signal after CMDRDY: CMD7 (card select), CMD28-29 (write protect),
 +	 * CMD38 (erase). I haven't tested any of them, but I rather expect
 +	 * them all to have the same sort of problem with MCI_SR not actually
 +	 * reflecting the state of the DAT0-line busy indicator.  So this code
 +	 * may need to grow some sort of special handling for them too. (This
 +	 * just in: CMD7 isn't a problem right now because dev/mmc.c incorrectly
 +	 * sets the response flags to R1 rather than R1B.)
 +	 */
 +
 +	if ((cmd->flags & MMC_RSP_BUSY)) {
 +		WR4(sc, MCI_IER, MCI_SR_ERROR | MCI_SR_NOTBUSY);
 +		return;
 +	}
 +
 +	/* If there is a data transfer with this command, then...
 +	 * - If it's a read, we need to wait for ENDRX.
 +	 * - If it's a write, now is the time to enable the PDC, and we need to
 +	 *   wait for a BLKE that follows a TXBUFE, because if we're doing a
 +	 *   split transfer we get a BLKE after the first half (when TPR/TCR get
 +	 *   loaded from TNPR/TNCR).  So first we wait for the TXBUFE, and the
 +	 *   handling for that interrupt will then invoke the wait for the
 +	 *   subsequent BLKE which indicates actual completion.
 +	 */
 +
 +	if (cmd->data) {
 +		uint32_t ier;
 +		if (cmd->data->flags & MMC_DATA_READ) {
 +			ier = MCI_SR_ENDRX;
 +		} else {
 +			ier = MCI_SR_TXBUFE;
 +			WR4(sc, PDC_PTCR, PDC_PTCR_TXTEN);
 +		}
 +		WR4(sc, MCI_IER, MCI_SR_ERROR | ier);
 +		return;
 +	}
 +
 +	/* If we made it to here, we don't need to wait for anything more for
 +	 * the current command, move on to the next command (will complete the
 +	 * request if there is no next command).
 +	 */
 +
 +	cmd->error = MMC_ERR_NONE;
 +	at91_mci_next_operation(sc);
  }
  
  static void
  at91_mci_intr(void *arg)
  {
  	struct at91_mci_softc *sc = (struct at91_mci_softc*)arg;
 -	uint32_t sr;
 -	int i, done = 0;
 -	struct mmc_command *cmd;
 +	struct mmc_command *cmd = sc->curcmd;
 +	uint32_t sr, isr;
  
  	AT91_MCI_LOCK(sc);
 -	sr = RD4(sc, MCI_SR) & RD4(sc, MCI_IMR);
 -//	printf("i 0x%x\n", sr);
 -	cmd = sc->curcmd;
 -	if (sr & MCI_SR_ERROR) {
 -		// Ignore CRC errors on CMD2 and ACMD47, per relevant standards
 -		if ((sr & MCI_SR_RCRCE) && (cmd->opcode == MMC_SEND_OP_COND ||
 -		    cmd->opcode == ACMD_SD_SEND_OP_COND))
 -			cmd->error = MMC_ERR_NONE;
 -		else if (sr & (MCI_SR_RTOE | MCI_SR_DTOE))
 +
 +	sr = RD4(sc, MCI_SR);
 +	isr = sr & RD4(sc, MCI_IMR);
 +
 +	if (mci_debug)
 +		printf("i 0x%x sr 0x%x\n", isr, sr);
 +
 +	/* All interrupts are one-shot; disable the ones that just fired.
 +	 * The next operation will re-enable whatever interrupts it wants.
 +	 */
 +
 +	WR4(sc, MCI_IDR, isr);
 +
 +	if (isr & MCI_SR_ERROR) {
 +		if (isr & (MCI_SR_RTOE | MCI_SR_DTOE))
  			cmd->error = MMC_ERR_TIMEOUT;
 -		else if (sr & (MCI_SR_RCRCE | MCI_SR_DCRCE))
 +		else if (isr & (MCI_SR_RCRCE | MCI_SR_DCRCE))
  			cmd->error = MMC_ERR_BADCRC;
 -		else if (sr & (MCI_SR_OVRE | MCI_SR_UNRE))
 +		else if (isr & (MCI_SR_OVRE | MCI_SR_UNRE))
  			cmd->error = MMC_ERR_FIFO;
  		else
  			cmd->error = MMC_ERR_FAILED;
 -		done = 1;
 -		if (sc->mapped && cmd->error) {
 -			bus_dmamap_unload(sc->dmatag, sc->map);
 -			sc->mapped--;
 +		/* CMD8 is used to probe for SDHC cards, a standard SD card will
 +		 * get a response timeout; don't report it because it's a normal
 +		 * and expected condition.  One might argue that all error
 +		 * reporting should be left to higher levels, but when they
 +		 * report at all it's always EIO, which isn't very helpful.
 +		 */
 +		if (cmd->opcode != 8) {
 +			device_printf(sc->dev, 
 +			    "IO error; status MCI_SR = 0x%x cmd opcode = %d%s\n",  
 +			    sr, cmd->opcode,
 +			    (cmd->opcode != 12) ? "" : 
 +			    (sc->flags & CMD_MULTIREAD) ? " after read" : " after write");
 +			at91_mci_reset(sc);
  		}
 +		at91_mci_next_operation(sc);
  	} else {
 -		if (sr & MCI_SR_TXBUFE) {
 +		if (isr & MCI_SR_TXBUFE) {
  //			printf("TXBUFE\n");
 -			at91_mci_xmit_done(sc);
 +			/* We need to wait for a BLKE that follows TXBUFE
 +			 * (intermediate BLKEs might happen after ENDTXes if
 +			 * we're chaining multiple buffers).  If BLKE is also
 +			 * asserted at the time we get TXBUFE, we can avoid
 +			 * another interrupt and process it right away, below.
 +			 */
 +			if (sr & MCI_SR_BLKE)
 +				isr |= MCI_SR_BLKE;
 +			else
 +				WR4(sc, MCI_IER, MCI_SR_BLKE);
  		}
 -		if (sr & MCI_SR_RXBUFF) {
 +		if (isr & MCI_SR_RXBUFF) {
  //			printf("RXBUFF\n");
 -			WR4(sc, MCI_IDR, MCI_SR_RXBUFF);
 -			WR4(sc, MCI_IER, MCI_SR_CMDRDY);
  		}
 -		if (sr & MCI_SR_ENDTX) {
 +		if (isr & MCI_SR_ENDTX) {
  //			printf("ENDTX\n");
  		}
 -		if (sr & MCI_SR_ENDRX) {
 +		if (isr & MCI_SR_ENDRX) {
  //			printf("ENDRX\n");
 -			at91_mci_read_done(sc);
 +			at91_mci_read_done(sc, sr);
  		}
 -		if (sr & MCI_SR_NOTBUSY) {
 +		if (isr & MCI_SR_NOTBUSY) {
  //			printf("NOTBUSY\n");
 -			WR4(sc, MCI_IDR, MCI_SR_NOTBUSY);
 -			WR4(sc, MCI_IER, MCI_SR_CMDRDY);
 +			at91_mci_notbusy(sc);
  		}
 -		if (sr & MCI_SR_DTIP) {
 +		if (isr & MCI_SR_DTIP) {
  //			printf("Data transfer in progress\n");
  		}
 -		if (sr & MCI_SR_BLKE) {
 +		if (isr & MCI_SR_BLKE) {
  //			printf("Block transfer end\n");
 +			at91_mci_write_done(sc, sr);
  		}
 -		if (sr & MCI_SR_TXRDY) {
 +		if (isr & MCI_SR_TXRDY) {
  //			printf("Ready to transmit\n");
  		}
 -		if (sr & MCI_SR_RXRDY) {
 +		if (isr & MCI_SR_RXRDY) {
  //			printf("Ready to receive\n");
  		}
 -		if (sr & MCI_SR_CMDRDY) {
 +		if (isr & MCI_SR_CMDRDY) {
  //			printf("Command ready\n");
 -			done = 1;
 -			cmd->error = MMC_ERR_NONE;
 -		}
 -	}
 -	if (done) {
 -		WR4(sc, MCI_IDR, 0xffffffff);
 -		if (cmd != NULL && (cmd->flags & MMC_RSP_PRESENT)) {
 -			for (i = 0; i < ((cmd->flags & MMC_RSP_136) ? 4 : 1);
 -			     i++) {
 -				cmd->resp[i] = RD4(sc, MCI_RSPR + i * 4);
 -//				printf("RSPR[%d] = %x\n", i, cmd->resp[i]);
 -			}
 +			at91_mci_cmdrdy(sc, sr);
  		}
 -		at91_mci_start(sc);
  	}
  	AT91_MCI_UNLOCK(sc);
  }
 @@ -703,7 +1247,7 @@
  		*(int *)result = sc->host.caps;
  		break;
  	case MMCBR_IVAR_MAX_DATA:
 -		*(int *)result = 1;
 +		*(int *)result = MAX_BLOCKS;
  		break;
  	}
  	return (0);
 
 


More information about the freebsd-arm mailing list