mm, proc: fix region lost in /proc/self/smaps
author Robert Ho <robert.hu@intel.com>
Sat, 8 Oct 2016 00:02:36 +0000 (17:02 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sat, 8 Oct 2016 01:46:30 +0000 (18:46 -0700)
commit 855af072b6c40aeb266f4dc98fd9a6a49edf22af
tree 23c097da09171f9290450ccf30a68606f86e7db5
parent 4b2bd5fec007a4fd3fc82474b9199af25013de4c
mm, proc: fix region lost in /proc/self/smaps

Recently, Red Hat reported that the nvml test suite failed on QEMU/KVM;
for more detailed info, please refer to:

   https://bugzilla.redhat.com/show_bug.cgi?id=1365721

Actually, this bug is not specific to NVDIMM/DAX; it also affects any other
file system.  This simple test case, abstracted from nvml, can easily
reproduce the bug in a common environment:

-------------------------- testcase.c -----------------------------

int
is_pmem_proc(const void *addr, size_t len)
{
        const char *caddr = addr;

        FILE *fp;
        if ((fp = fopen("/proc/self/smaps", "r")) == NULL) {
                printf("!/proc/self/smaps");
                return 0;
        }

        int retval = 0;         /* assume false until proven otherwise */
        char line[PROCMAXLEN];  /* for fgets() */
        char *lo = NULL;        /* beginning of current range in smaps file */
        char *hi = NULL;        /* end of current range in smaps file */
        int needmm = 0;         /* looking for mm flag for current range */
        while (fgets(line, PROCMAXLEN, fp) != NULL) {
                static const char vmflags[] = "VmFlags:";
                static const char mm[] = " wr";

                /* check for range line */
                if (sscanf(line, "%p-%p", &lo, &hi) == 2) {
                        if (needmm) {
                                /* last range matched, but no mm flag found */
                                printf("never found mm flag.\n");
                                break;
                        } else if (caddr < lo) {
                                /* never found the range for caddr */
                                printf("#######no match for addr %p.\n", caddr);
                                break;
                        } else if (caddr < hi) {
                                /* start address is in this range */
                                size_t rangelen = (size_t)(hi - caddr);

                                /* remember that matching has started */
                                needmm = 1;

                                /* calculate remaining range to search for */
                                if (len > rangelen) {
                                        len -= rangelen;
                                        caddr += rangelen;
                                        printf("matched %zu bytes in range "
                                                "%p-%p, %zu left over.\n",
                                                        rangelen, lo, hi, len);
                                } else {
                                        len = 0;
                                        printf("matched all bytes in range "
                                                        "%p-%p.\n", lo, hi);
                                }
                        }
                } else if (needmm && strncmp(line, vmflags,
                                        sizeof(vmflags) - 1) == 0) {
                        if (strstr(&line[sizeof(vmflags) - 1], mm) != NULL) {
                                printf("mm flag found.\n");
                                if (len == 0) {
                                        /* entire range matched */
                                        retval = 1;
                                        break;
                                }
                                needmm = 0;     /* saw what was needed */
                        } else {
                                /* mm flag not set for some or all of range */
                                printf("range has no mm flag.\n");
                                break;
                        }
                }
        }

        fclose(fp);

        printf("returning %d.\n", retval);
        return retval;
}

/* Start of the file mapping; assigned from mmap() in main(), read by worker(). */
void *Addr;
/* Length of the mapping in bytes; assigned from the file's st_size in main(). */
size_t Size;

/*
 * worker -- thread entry point
 *
 * Runs is_pmem_proc() on the global mapping (Addr, Size) and stores the
 * verdict in the int that arg points to.  Always returns NULL.
 */
static void *
worker(void *arg)
{
        int *result = arg;

        *result = is_pmem_proc(Addr, Size);
        return NULL;
}

int main(int argc, char *argv[])
{
        if (argc <  2 || argc > 3) {
                printf("usage: %s file [env].\n", argv[0]);
                return -1;
        }

        int fd = open(argv[1], O_RDWR);

        struct stat stbuf;
        fstat(fd, &stbuf);

        Size = stbuf.st_size;
        Addr = mmap(0, stbuf.st_size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);

        close(fd);

        pthread_t threads[NTHREAD];
        int ret[NTHREAD];

        /* kick off NTHREAD threads */
        for (int i = 0; i < NTHREAD; i++)
                pthread_create(&threads[i], NULL, worker, &ret[i]);

        /* wait for all the threads to complete */
        for (int i = 0; i < NTHREAD; i++)
                pthread_join(threads[i], NULL);

        /* verify that all the threads return the same value */
        for (int i = 1; i < NTHREAD; i++) {
                if (ret[0] != ret[i]) {
                        printf("Error i %d ret[0] = %d ret[i] = %d.\n", i,
                                ret[0], ret[i]);
                }
        }

        printf("%d", ret[0]);
        return 0;
}

It failed because some threads cannot find the memory region in
"/proc/self/smaps" that was allocated in the main process.

It is caused by proc fs, which uses 'file->version' to indicate the last VMA
that has already been handled by the read() system call. When the next
read() is issued, it uses the 'version' to find that VMA, and the VMA
following it is the one we want to handle. The related code is as follows:

        if (last_addr) {
                vma = find_vma(mm, last_addr);
                if (vma && (vma = m_next_vma(priv, vma)))
                        return vma;
        }

However, a VMA will be lost if the last handled VMA is gone, e.g.:

The process VMA list is A->B->C->D

CPU 0                                  CPU 1
read() system call
   handle VMA B
   version = B
return to userspace

                                   unmap VMA B

issue read() again to continue to get
the region info
   find_vma(version) will get VMA C
   m_next_vma(C) will get VMA D
   handle D
   !!! VMA C is lost !!!

In order to fix this bug, we make 'file->version' indicate the end address
of the current VMA.  m_start will then look up a vma with vma_start
< last_vm_end and move on to the next vma if it finds the same or an
overlapping vma.  This guarantees that we will not miss an exclusive
vma, but we can still miss one if the previous vma was shrunk.  This is
acceptable because guaranteeing "never miss a vma" is simply not feasible.
The user has to cope with some inconsistencies if the file is not read in
one go.

[mhocko@suse.com: changelog fixes]
Link: http://lkml.kernel.org/r/1475296958-27652-1-git-send-email-robert.hu@intel.com
Acked-by: Dave Hansen <dave.hansen@intel.com>
Signed-off-by: Xiao Guangrong <guangrong.xiao@linux.intel.com>
Signed-off-by: Robert Hu <robert.hu@intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Gleb Natapov <gleb@kernel.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Stefan Hajnoczi <stefanha@redhat.com>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
fs/proc/task_mmu.c