from CSE142L.notebook import *
from notebook import *
# if you get something about NUMEXPR_MAX_THREADS being set incorrectly, don't worry. It's not a problem.
This lab is a continuation of the previous lab. While that lab focused on the basics of cache-aware programming and spatial locality, this lab will focus more on temporal locality and how you can modify your programs to maximize it.
As a reminder, between this lab and the last, you'll learn about spatial and temporal locality, working sets, the deeper levels of the memory hierarchy, and how to restructure code to make better use of the caches.

Along the way, we'll address several of the "interesting questions" we identified in the first lab, including why CPI changes (and why that change occurs so quickly) and related questions about IC and CPI.

This lab includes a programming assignment.
Check the course schedule for due date(s).
Your grade for this lab will be based on the following components:

Part | Value |
---|---|
Reading quiz | 3% |
Jupyter Notebook | 45% |
Programming Assignment | 50% |
Post-lab survey | 2% |
No late work or extensions will be allowed.
We will grade 5 of the "completeness" problems. They are worth 3 points each. We will grade all of the "correctness" questions.
You'll follow the directions at the end of the lab to submit the lab write up and the programming assignment through gradescope.
Please check gradescope for exact due dates.
The only new tool you'll be using in this lab is a new kind of graph that visualizes a "trace" of the memory accesses made by a program. Traces get used a lot in computer system analysis. In this case, it's a memory trace, which is just a list of all the memory accesses a program makes.

We generate the trace using Intel's Pin binary instrumentation tool, which can do all kinds of cool things. In our case, it injects code into the running executable that writes the address of each load and store to a file along with some metadata (e.g., which thread made the access and whether it was a read or a write).
Here's an example:
The horizontal axis is the "access number". The first memory access in the program has access number 1, the second has access number 2, and so on. The vertical axis is the relative address. The colors mean different things in different plots.
You might ask "Why are these graphs janky screen captures?" Well, the graphs were rendered with a very cool Jupyter Notebook extension called Moneta that some of the first students to take 142L wrote as a group independent study during the spring of 2020 (the first quarter of the pandemic). It was an awesome interactive tool that let you pan and zoom and measure memory traces. However, it suffered from two key flaws:
Moneta II is in the works to address these problems, but it's not ready yet. So, for now, we have screen caps.
In the last lab, we examined the notion of spatial locality in detail. Now, we will turn to temporal locality.
Temporal locality exists when a program accesses the same memory multiple times within a short time. Caches exploit temporal locality by holding on to data that has been accessed recently. If the processor accesses it again, the cache can provide it very quickly.
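For a concrete (if contrived) sketch of what this looks like in code -- this function is made up for illustration and is not part of the lab -- the inner loop below touches the same small array over and over, so after the first pass over `hot`, almost every access hits in the L1 cache:

```cpp
#include <cstdint>
#include <cstddef>

// Hypothetical example of temporal locality: `hot` is small enough to stay
// resident in the L1 cache, and we reuse it `reps` times, so only the first
// pass pays for the misses.
uint64_t reuse_sum(const uint32_t *hot, size_t n, size_t reps) {
    uint64_t sum = 0;
    for (size_t r = 0; r < reps; r++)
        for (size_t i = 0; i < n; i++)   // n * 4 bytes is much smaller than 32kB
            sum += hot[i];
    return sum;
}
```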
With spatial locality, it was pretty easy to predict the cache miss rate for a simple loop that performs stride-based accesses (see below). With temporal locality it is harder because of associativity and conflicts. Before we dive into that, let's have a quick refresher on how caches work (if this is fuzzy, go back and review the slides and/or readings from 142).
When a memory operation (load or store) accesses a memory location, $A$, the cache breaks $A$'s address into three parts:
tag | index | offset |
---|---|---|
the remaining bits | log2(# of associative sets) | log2(cache line size) |
Together, the tag and the index of $A$ are a unique name (or number) for the cacheline-sized (and cacheline size-aligned) piece of memory that contains $A$. The index of $A$ tells the cache which associative set might contain that cache line.
The cache can then check that set to see if $A$ is present. If it is, it's a hit. If not, it's a miss, and the cache will choose one of the lines in the set to evict to make room for $A$'s cache line.
There are two important things to note:
The L1 data cache in our processor is 32kB, with 64-byte lines, and it's 8-way set associative. So, there are 32,768/64 = 512 cache lines arranged in 512/8 = 64 associative sets. If the machine has 16GB of memory, it has 256 million cache lines' worth of main memory, so roughly 4 million of those cache lines map to each associative set. Clearly, there are plenty of opportunities for conflicts.
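To make that arithmetic concrete, here's a small sketch (not part of the lab code) that splits an address into the three fields for this L1 configuration -- 64-byte lines and 64 sets give 6 offset bits and 6 index bits:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    const uint64_t LINE_BITS  = 6;   // log2(64-byte line size)
    const uint64_t INDEX_BITS = 6;   // log2(64 sets) for the 32kB, 8-way L1

    uint64_t A      = 0x7ffd12345678;                  // an arbitrary example address
    uint64_t offset = A & ((1ull << LINE_BITS) - 1);   // low 6 bits
    uint64_t index  = (A >> LINE_BITS) & ((1ull << INDEX_BITS) - 1);
    uint64_t tag    = A >> (LINE_BITS + INDEX_BITS);   // the remaining bits

    printf("tag=0x%llx index=%llu offset=%llu\n",
           (unsigned long long)tag, (unsigned long long)index,
           (unsigned long long)offset);
    return 0;
}
```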
In lecture, you heard about the "working set" of an application, and the notion of a working set is deeply tied to temporal locality. The working set is the portion of memory that the program is currently using. The connection between working sets and temporal locality lies in the word "currently" since that refers to a period of time. In essence, the working set is the set of cache lines that a program accesses repeatedly over a period of time.
One thing to note: Without reuse, there can be no temporal locality. A single access to a cache line has no temporal locality.
Generally speaking, there will be fewer cache misses (and performance will be faster) if the working set fits in the L1 cache (or failing that, in the L2 cache).
void working(uint64_t size) {
    auto s = new std::set<uint64_t>();
    uint64_t seed = 1;
    uint64_t sum = 0;
    // Fill the set with `size` pseudo-random values.
    for(uint x = 0; x < size; x++) {
        auto t = fast_rand(&seed);
        s->insert(t);
    }
    seed = 1;  // Reset the seed so the loop below regenerates the same values.
    start_measurement();
    // Look up each value we inserted, in the same order, and sum them.
    for(uint x = 0; x < size; x++) {
        auto a = s->find(fast_rand(&seed));
        sum += *a;
    }
    end_measurement();
    delete s;
}
Here's what the memory trace looks like:

What you are looking at is the region of the heap that the C++ standard library is allocating to hold the set. Since it's a tree-based structure, it's made up of many small objects that get allocated with `new`. The heap allocates space starting at a low address and working upward -- hence the diagonal.
The color key is:
Recall from lecture (or review the slides) that we can classify cache misses into three types (known as "The Three C's"):
Compulsory: These misses occur because the processor has not accessed this cache line before.
Capacity: These occur because the program is accessing more memory than the cache can hold (i.e., its working set is bigger than the cache).
Conflict: These occur because a given cache line of memory can only live in one of the associative sets of the cache.
Let's try to produce some conflict misses. In the last lab, we used a miss machine to generate lots of misses. They were mostly capacity misses (i.e., we accessed too many cache lines), and the miss machine let us produce lots of seemingly random accesses really fast. For conflict misses, we need something different: Highly-organized misses placed precisely.
The necessary ingredient for lots of conflict misses is many memory accesses that map to the same associative set in the cache. If we access many such cache lines, the associative set will "overflow" and that will cause misses.
Assume our 32kB cache with 64-byte lines and 8-way associativity and 64-bit addresses. Given an address $A$, how can we compute a new address, $B$, that will map to the same associative set but is not part of the same cache line as $A$? Given an index, $i$, into an array, how can we compute the index of another element, $j$, that will conflict with the first?
How do you compute $B$?

How do you compute $j$?
The main lesson here is that conflict misses are largely a product of bad luck if the working set is smaller than the cache: It may happen that for a particular cache capacity, associativity, and line size, that many cache lines in the application's working set happen to map to the same associative set.
Fortunately, in modern processors caches are pretty highly-associative (ours is 8-way) and at that level of associativity conflict misses are not a huge problem. If your working set is smaller than your cache's capacity, you'd have to be very unlucky to have enough cache lines land in the same associative set to cause many conflict misses. If your working set is larger than your cache, you're going to have misses regardless. As the example above shows, it is not hard to construct programs with small working sets that are unlucky. We have a term for these access patterns: We say they are "pathological".
By definition, pathological access patterns are rare, so we don't spend too much time worrying about them. But they can crop up and it's a good idea to be aware of the possibility.
So far in these two labs, we have focused on the L1 cache, but our machine also has L2 and L3 caches. Here's how they are organized:
As a reminder, the L1 is 32kB, 8-way set associative, with 64-byte lines. So, there are 512 cache lines divided into 64 associative sets.
The L1 and L2 are private to each core while the L3 is shared among all the cores on the CPU. The L2 is 256kB and is 8-way set associative. The L3 is 2MB per core, but it's shared across all the cores. Our machine has six cores, so 12MB.
On our machine, the L3 is the "last level cache" or LLC. The LLC is the cache just before DRAM. On some processors the LLC is the L2.
The three levels of on-chip caches set the number of cache lines the processor can quickly access. As you heard in 142, though, there is another kind of cache in the processor: the TLB. Instead of data, the TLB caches the translations from virtual addresses to physical addresses, and its size sets the number of pages your program can access quickly.
Here's what our processor has:
This is a little more complicated than what you heard about in 142. First off, there is an L1 TLB and an L2 TLB. If we think of the L1 TLB as a cache for memory translations, then the L2 TLB is exactly analogous to the L2 cache: if the processor misses in the L1 TLB, it can look in the L2 TLB. One important point: memory address translation always happens at the L1 cache because all the caches are physically tagged. This means that the L2 TLB has nothing to do with the L2 cache.

The L2 TLB can cover 4MB worth of 4kB pages of virtual address space. If you are using more pages than that, you'll get TLB misses and your performance will suffer.
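As a quick back-of-the-envelope check, that coverage figure implies the L2 TLB holds roughly

$$\frac{4\,\mathrm{MB}}{4\,\mathrm{kB\ per\ page}} = 1024$$

translations, so a program whose working set spans more than about a thousand distinct 4kB pages will start taking L2 TLB misses.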
Minimizing cache misses is critical for maximizing performance because, as you have seen, even a small number of misses can inflate CPI and ET. As a result, programmers who are concerned about performance often spend a lot of effort optimizing their code to reduce misses.

Below, we'll take a look at two common optimizations that aim to increase locality: loop reordering and tiling.
In the compiler lab, you explored several other optimizations that compilers apply very effectively. While there are compilers that apply these (and other) locality optimizations, many do not, and even when they do, the optimizations are less effective when applied automatically. So performance-obsessed programmers often apply locality optimizations by hand (but, of course, only when profiling and Amdahl's law show it's potentially profitable!).
Loop reordering or "re-nesting" is an optimization that changes the order in which loops are nested to improve locality. For instance, consider the code below. It initializes a 2D tensor, but it does it twice: in `x_inside()`, the loop over `x` is the inner loop of the nest; in `x_outside()`, it is the outer loop.
#include"cfiddle.hpp"
#include"tensor_t.hpp"
#include"util.hpp"
#include<cstdint>
extern "C"
void x_inside(uint64_t size, uint64_t rows) {
tensor_t<uint32_t> t(size/rows,rows,1,1);
disable_prefetcher();
flush_caches();
start_measurement();
for(uint y = 0; y < rows; y++) {
for(uint x = 0; x < size/rows; x++) {
t.get(x,y,0,0) = x;
}
}
end_measurement();
}
extern "C"
void x_outside(uint64_t size, uint64_t rows) {
tensor_t<uint32_t> t(size/rows,rows,1,1);
disable_prefetcher();
flush_caches();
start_measurement();
for(uint x = 0; x < size/rows; x++) {
for(uint y = 0; y < rows; y++) {
t.get(x,y,0,0) = x;
}
}
end_measurement();
}
// Cfiddle-signature=c92a3c7647aabca719020bc299f5a6a8
Here's the memory trace for `x_inside()`:

And here it is for `x_outside()`:
Remarkably, those two plots contain exactly the same memory accesses; they are just distributed differently through time.

Recall from our earlier discussion of `tensor_t` that incrementing the first argument to `get()` corresponds to moving to the next element in the underlying array of data. In the code above, `x` is the first argument to `get()`, so putting the `x` loop inside leads to better spatial locality.
You can see this reflected in the traces more clearly if we zoom in. With `x` on the inside, the program marches linearly through memory:

See how the ranges of both the horizontal and vertical axes are quite small. This means the accesses are close together in both space and time.
With the `x` loop outside, the program takes large strides through the array, and repeat accesses to the same addresses are spread out over a long time (note the much larger range on both axes). In particular, it doesn't access the same 64-byte cache line again until long after it has been evicted from the cache.
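A rough estimate of the consequences for the L1 (assuming the prefetcher stays disabled, as it does in this code): with 4-byte `uint32_t` elements and 64-byte lines, the linear sweep in `x_inside()` only misses when it crosses into a new cache line,

$$\text{miss rate} \approx \frac{\text{element size}}{\text{line size}} = \frac{4}{64} = \frac{1}{16},$$

while `x_outside()` can miss on nearly every access once one pass of its inner loop touches more distinct cache lines than the cache can hold.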
Renesting loops can improve spatial locality, but it is generally less effective for improving temporal locality. There are two criteria that must be met in order to exploit temporal locality:

1. The program must access the same data more than once (i.e., there must be reuse).
2. The repeated accesses must occur close enough together in time that the data is still in the cache when it is accessed again.
This second condition has a direct connection to working set size: If the working set size of a piece of code is too large, it is likely that parts of it will be evicted before they are accessed again, making it hard for the processor to exploit the temporal and spatial locality.
Our goal, then, is to shrink the working set so that it fits in the cache and we can exploit the resulting locality.
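Tiling (the second optimization mentioned above) does exactly that: it breaks the data into cache-sized pieces and finishes all the work on one piece before moving to the next. Here's a minimal sketch of the idea on a 1-D array; the functions and the tile size are made up for illustration and are not part of the lab code:

```cpp
#include <algorithm>
#include <cstdint>
#include <cstddef>

// Two back-to-back passes stream the whole array through the cache twice,
// so the second pass finds nothing left in the cache (no temporal locality).
void two_passes(uint32_t *data, size_t n) {
    for (size_t i = 0; i < n; i++) data[i] += 1;   // pass 1
    for (size_t i = 0; i < n; i++) data[i] *= 2;   // pass 2: data[i] was evicted long ago
}

// Tiling shrinks the working set: do both passes on one tile while it is
// still resident, then move on to the next tile.
void tiled_passes(uint32_t *data, size_t n) {
    const size_t TILE = 4096;  // 4096 * 4 bytes = 16kB, comfortably inside a 32kB L1
    for (size_t start = 0; start < n; start += TILE) {
        size_t end = std::min(start + TILE, n);
        for (size_t i = start; i < end; i++) data[i] += 1;
        for (size_t i = start; i < end; i++) data[i] *= 2;  // likely still cached
    }
}
```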
Your programming assignment for this lab is to implement a memory allocator. You've used memory allocators a lot: in C++ you access them via `new` and `delete`. In C, it's `malloc()` and `free()` (which `new` and `delete` call internally). The default implementations of `malloc()` and `free()` are general purpose: they are reasonably fast and can allocate data of any size. However, many applications need to allocate many, many objects very quickly, and for them a general-purpose allocator is too slow. In these cases, it makes sense to implement a specialized memory allocator, which can be much faster.
In this lab, you'll be implementing a specialized memory allocator that has these basic characteristics:
Here's the reference implementation:
#include <stdlib.h>
#include <iostream>
#include <set>
#include <cstring>
#include "cfiddle.hpp"
#include"ChunkAlloc.hpp"
template<
class T, // This is the type we are allocating.
// You can assume this is less than or equal to 4kB
size_t ALIGNMENT // The alignment at which we must allocate the objects.
// You can assume this is less than or equal to 4kB
>
class ReferenceAllocator {
std::set<T*> chunks; // We store everything we allocated so we can clean up in the destructor.
public:
typedef T ItemType; // This will make T available as ReferenceAllocator::ItemType
static const size_t Alignment = ALIGNMENT; // Likewise, we can access the alignment as
// ReferenceAllocator::Alignment
ReferenceAllocator() {}
T * alloc() {
void* p = NULL;
// This library call can allocate arbitrary-sized and aligned
// objects. Since it can handle any size, it's more general.
int r = posix_memalign(&p, ALIGNMENT, sizeof(T));
if (r != 0) {
std::cerr << "posix_memalign() failed. Exiting: " << strerror(r) << "(" << r << ")\n";
exit(1);
}
// posix_memalign() provides a void*, which we can't index or write through. So cast...
uint8_t * t = reinterpret_cast<uint8_t*>(p);
for(uint i= 0; i < sizeof(T); i++) {
t[i] = 0; // and set to zero.
}
T* c = reinterpret_cast<T*>(p); // cast to the type we allocate.
new (c) T; // This is the "in place" new operator.
// It constructs an object at a given location.
chunks.insert(c); // record it so we can delete it later.
return c;
}
void free(T * p) {
std::free(reinterpret_cast<void*>(p)); // Return the memory
chunks.erase(p); // note that it's no longer allocated.
}
~ReferenceAllocator() {
for(auto & p: chunks) {
std::free(reinterpret_cast<void*>(p)); // Return everything that is still allocated.
}
}
};
template<class T, size_t ALIGNMENT>
const size_t ReferenceAllocator<T, ALIGNMENT>::Alignment;
First, note that it's a template class that takes two parameters:

- `T`: the type it will allocate. You can assume the size of `T` is no larger than 4096 bytes.
- `ALIGNMENT`: the alignment of the allocations it will make. The alignment can be any power of 2 between 8 and 4096 bytes (8, 16, 32, 64, etc.).

The allocator has just four methods:

- `ReferenceAllocator()`: the constructor.
- `alloc()`: allocates and returns an instance of `T`.
- `free()`: deallocates `p`, a previously allocated instance of `T`.
- `~ReferenceAllocator()`: the destructor for the allocator. It needs to clean up all the memory the allocator manages.

It also defines `ReferenceAllocator::ItemType`, which lets us access the type the allocator allocates, and `ReferenceAllocator::Alignment`, which gives us access to the alignment size.
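For example, client code might use the allocator like this (the `Node` type here is hypothetical, just for illustration):

```cpp
#include <cstdint>
#include "ReferenceAllocator.hpp"

struct Node {          // A made-up type, just for this example.
    uint64_t value;
    Node *next;
};

void example() {
    // Allocate Nodes aligned to 64-byte boundaries (one cache line each).
    ReferenceAllocator<Node, 64> allocator;

    Node *n = allocator.alloc();  // n is zeroed, constructed, and 64-byte aligned.
    n->value = 42;
    allocator.free(n);            // Hand it back to the allocator.
}                                 // The destructor cleans up anything still allocated.
```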
The implementation above just relies on the standard library's alignment-aware interface to the general-purpose allocator (`posix_memalign()`). This means it's not optimized to exploit the fact that we only need to allocate a single size of object, and this is where the big optimization opportunities lie.

Your allocator will not rely on any of the normal memory-allocating library calls. Instead, it will use a very simple allocator to allocate memory in bulk from the operating system. The interface is called `ChunkAlloc`:
#define CHUNK_SIZE (128*1024)
extern void init_chunk(); // Set things up.
extern void * alloc_chunk(); // Allocate CHUNK_SIZE of memory
extern void free_chunk(void*p); // Free CHUNK_SIZE of memory
extern size_t get_allocated_chunks(); // Query how many chunks are currently allocated.
#include <stdlib.h>
#include<iostream>
#include <string.h>
#include "ChunkAlloc.hpp"
#include <sys/mman.h>
static size_t allocated_chunks = 0;
void init_chunk() {
// This used to do stuff for Moneta. Now it's empty.
}
void * alloc_chunk() { // allocate CHUNK_SIZE bytes of memory by asking the operating system for it.
// this is how malloc gets its memory from the kernel.
// mmap() can do many things. In this case, it just asks the kernel to
// give us some pages of memory. They are guaranteed to contain zeros.
void * r = mmap(NULL, CHUNK_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, 0, 0);
if (r == MAP_FAILED) {
std::cerr << "alloc_chunk() failed. This often means you've allocated too many chunks. Exiting: " << strerror(errno) << "\n";
exit(1);
}
allocated_chunks++; // This is just statistics tracking
return r;
}
void free_chunk(void*p) { // Return the chunk to the OS. After this, accesses to the addresses in the chunk will result in SEGFAULT
int r = munmap(p, CHUNK_SIZE);
if (r != 0) {
std::cerr << "free_chunk() failed. exiting: " << strerror(errno) << "\n";
exit(1);
}
allocated_chunks--;
}
size_t get_allocated_chunks() {
return allocated_chunks;
}
`ChunkAlloc` gets memory the same way `malloc()` does: by calling the `mmap()` system call, which stands for "memory map". `mmap()` is a great tool and can do many things. You can read about it here, but understanding it is not necessary for this lab. What is important for our purposes is that `alloc_chunk()` will return a 128kB region (or "slab", which is how slab allocators got their name) of memory that is 4kB-aligned.
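For a sense of scale, each chunk holds many objects. If, for example, an object's footprint (its size rounded up to its alignment) were 64 bytes -- a made-up number, not something the lab specifies -- a single chunk could hold

$$\frac{\mathtt{CHUNK\_SIZE}}{64\,\mathrm{B}} = \frac{131072}{64} = 2048$$

objects.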
You're going to build your own version of `ReferenceAllocator` called `AlignedAllocator`. You'll find a copy of `ReferenceAllocator.hpp` in `AlignedAllocator.hpp`. Do your work there.

Now that you understand the basics of how `ReferenceAllocator` works, here's the detailed list of requirements for `AlignedAllocator`:
1. The pointer `AlignedAllocator::alloc()` returns must be aligned to `ALIGNMENT`, so `addr % ALIGNMENT == 0`.
2. The pointer `AlignedAllocator::alloc()` returns must point to at least `sizeof(T)` bytes of memory.
3. `AlignedAllocator::alloc()` needs to set all bytes of memory that the instance of `T` will occupy to zero before constructing `T`.
4. `AlignedAllocator::alloc()` needs to construct an instance of `T` in the memory using the in-place `new` operator (see below).
5. `AlignedAllocator::alloc()` cannot return the same pointer twice unless the pointer has been deallocated with `AlignedAllocator::free()` first.
6. After destruction, `AlignedAllocator` must have called `free_chunk()` for every chunk it allocated with `alloc_chunk()`, i.e., `get_allocated_chunks()` must return 0.
7. You may use STL containers internally in `AlignedAllocator`, but `alloc()` may not return any memory that is a part of an STL container.
8. You must get and release memory using only `alloc_chunk()`/`free_chunk()`. No calls to `malloc()` (or other functions from the standard library that allocate raw memory) or `new` (other than the "in place" version) to allocate the space `AlignedAllocator` will return.
9. After memory has been returned with `free()`, your allocator should reallocate that memory before requesting new memory via `alloc_chunk()`. This prevents your allocator from continually allocating new memory, which might be very fast but is not a realistic solution.

`ReferenceAllocator` already satisfies all of these except the last two: `ReferenceAllocator` uses `posix_memalign()`, which is forbidden in your solution. With respect to recycling, I'm not sure how `posix_memalign()` works internally, so I'm not sure how it recycles, but it does something reasonably efficient. You'll find that removing `posix_memalign()` and meeting the above criteria will require you to rewrite most of the code in `AlignedAllocator.hpp`.
Your implementation will be evaluated based on correctness and performance. Your implementation of `AlignedAllocator` must pass the tests in `run_tests.cpp`, which cover the requirements listed above. There will be hidden test cases.

The performance portion is based on the average score over several benchmarks in `Allocator.cpp`. Details of the grade calculation are given under "Final Measurement" below.
The code for the benchmarks is in `Allocator.cpp`. The code is below. The key functions are `bench()`, `microbench()`, `exercise()`, and `miss_machine()`. Here's what they do:

- `microbench()` just calls `alloc()` many times and records the execution time. Then it does the same for `free()`.
- `exercise()` allocates a bunch of objects, deletes some at random, allocates some more, deletes some at random, etc. This puts your allocator into a "warmed up" or "well-used" state to approximate how it would behave in a long-running program.
- `bench()` measures the execution time of `exercise()`.
- `miss_machine()` measures the speed of a specialized version of the miss machine that tests how well your allocator manages spatial locality (more below).

The `*_solution` functions near the bottom are wrappers to run each benchmark. Pay attention to where `start_measurement()` and `end_measurement()` are called.
#include <stdlib.h>
#include<iostream>
#include"cfiddle.hpp"
#include<map>
#include<vector>
#include<algorithm>
#include"ReferenceAllocator.hpp"
#if USE_INSTRUCTOR_SOLUTION == 1
# include"admin/FastAllocator.hpp"
#else
# if USE_INSTRUCTOR_SOLUTION == 2
# define ReferenceAllocator AlignedAllocator // This is terrible. I'm sorry.
# include"ReferenceAllocator.hpp"
# undef ReferenceAllocator
# else
# include"AlignedAllocator.hpp"
# endif
#endif
template<class Allocator>
void exercise(Allocator * allocator, size_t count, int iterations, uint64_t seed, bool cleanup = false) {
// Interesting allocator behaviors and bugs emerge when the allocator
// has to allocate and free objects in complex patterns.
//
// To simulate that, we allocate count items and then, on each
// iteration, free about 1/4 of them and replace them with new items.
std::vector<typename Allocator::ItemType *> items(count);
for(unsigned int i = 0; i < count; i++)
items[i] = NULL;
for(int i = 0; i < iterations; i++) {
for(unsigned int j = 0; j < count; j++) {
if (items[j] == NULL) {
items[j] = allocator->alloc();
}
}
for(unsigned int j = 0; j < count; j++) {
fast_rand(&seed);
if (seed & 0x3) {
allocator->free(items[j]);
items[j] = NULL;
}
}
}
if (cleanup) {
for(unsigned int j = 0; j < count; j++) {
if (items[j]) {
allocator->free(items[j]);
items[j] = NULL;
}
}
}
}
template<class Allocator>
void bench(uint64_t count, uint64_t seed, bool do_exercise, const char * tag) {
auto alloc = new Allocator;
if (do_exercise){ // warm it up.
exercise<Allocator>(alloc, 4000, 20, seed);
}
start_measurement(tag);
exercise<Allocator>(alloc, count/16, 16, seed);
end_measurement();
delete alloc;
}
template<class Allocator>
void microbench(uint64_t count, uint64_t seed, bool do_exercise, const char * alloc_tag, const char * free_tag) {
auto alloc = new Allocator;
if (do_exercise) { // get the allocator warmed up.
exercise<Allocator>(alloc, 4000, 20, seed);
}
std::vector<typename Allocator::ItemType*> items(count);
start_measurement(alloc_tag);
for(uint64_t i = 0; i < count; i++) {
items[i] = alloc->alloc();
}
end_measurement();
if (do_exercise) {
exercise<Allocator>(alloc, 4000, 20, seed);
}
start_measurement(free_tag);
for(uint64_t i = 0; i < count; i++) {
alloc->free(items[i]);
}
end_measurement();
delete alloc;
}
//BEGIN
struct MissingLink {
struct MissingLink * next;
};
extern "C"
struct MissingLink* __attribute__((noinline)) do_misses(struct MissingLink * l, uint64_t access_count) {
for(uint i = 0; i < access_count; i++) {
l = l->next;
}
return l;
}
template<class Allocator>
void miss_machine(uint64_t link_count, uint64_t access_count, uint64_t seed, const char * tag) {
auto alloc = new Allocator; // create the allocator.
exercise<Allocator>(alloc, 10000, 20, seed); // warm it up.
std::vector<struct MissingLink *> links(link_count); // Storage for the links
for(auto &i : links) { // allocate them.
i = alloc->alloc();
i->next = NULL;
}
std::shuffle(links.begin(), links.end(), fast_URBG(seed)); // randomize the order of the links
for(uint i = 0; i < links.size() -1; i++) {
links[i]->next = links[i+1]; // Make the next pointers reflect the ordering.
}
links.back()->next = links.front(); // complete the circle
struct MissingLink * l = links[0];
start_measurement(tag);
l = do_misses(l, access_count); // Do the misses.
end_measurement();
delete alloc;
}
//END
// Call the starter code
extern "C"
void allocator_bench_starter(uint64_t count, uint64_t seed) {
bench<ReferenceAllocator<uint8_t[3], 16>>(count, seed, true, "bench-3-bytes" );
bench<ReferenceAllocator<uint8_t[125], 32>>(count, seed, true, "bench-125-bytes" );
bench<ReferenceAllocator<uint8_t[4096], 4096>>(count, seed, true, "bench-4096-bytes");
}
extern "C"
void allocator_microbench_starter(uint64_t count, uint64_t seed) {
microbench<ReferenceAllocator<uint[4], 8>>(count, seed, true, "alloc-4-bytes", "free-4-bytes");
microbench<ReferenceAllocator<uint[1024], 4096>>(count, seed, true, "alloc-1024-bytes", "free-1024-bytes");
}
extern "C"
void miss_machine_starter(uint64_t link_count, uint64_t access_count, uint64_t seed) {
miss_machine<ReferenceAllocator<struct MissingLink, sizeof(struct MissingLink)> >(link_count, access_count, seed, "miss-machine");
}
// Call your code
extern "C"
void allocator_bench_solution(uint64_t count, uint64_t seed) {
bench<AlignedAllocator<uint8_t[3], 16>>(count, seed, true, "bench-3-bytes" );
bench<AlignedAllocator<uint8_t[125], 32>>(count, seed, true, "bench-125-bytes" );
bench<AlignedAllocator<uint8_t[4096], 4096>>(count, seed, true, "bench-4096-bytes");
}
extern "C"
void allocator_microbench_solution(uint64_t count, uint64_t seed) {
microbench<AlignedAllocator<uint[4], 8>>(count, seed, true, "alloc-32-bytes", "free-32-bytes");
microbench<AlignedAllocator<uint[256], 4096>>(count, seed, true, "alloc-1024-bytes", "free-1024-bytes");
}
extern "C"
void miss_machine_solution(uint64_t link_count, uint64_t access_count, uint64_t seed) {
miss_machine<AlignedAllocator<struct MissingLink, sizeof(struct MissingLink)> >(link_count, access_count, seed, "miss-machine");
}
### `miss_machine()`

`miss_machine()` is the most interesting of the three benchmark functions because it addresses one of the subtle problems that can arise with a memory allocator.

Because the memory allocator controls the layout of a program's memory, it can have a strong impact on how a program accesses memory. For instance:

The `miss_machine()` function in `Allocator.cpp` explores the third issue. Here's the code again:
//BEGIN
struct MissingLink {
struct MissingLink * next;
};
extern "C"
struct MissingLink* __attribute__((noinline)) do_misses(struct MissingLink * l, uint64_t access_count) {
for(uint i = 0; i < access_count; i++) {
l = l->next;
}
return l;
}
template<class Allocator>
void miss_machine(uint64_t link_count, uint64_t access_count, uint64_t seed, const char * tag) {
auto alloc = new Allocator; // create the allocator.
exercise<Allocator>(alloc, 10000, 20, seed); // warm it up.
std::vector<struct MissingLink *> links(link_count); // Storage for the links
for(auto &i : links) { // allocate them.
i = alloc->alloc();
i->next = NULL;
}
std::shuffle(links.begin(), links.end(), fast_URBG(seed)); // randomize the order of the links
for(uint i = 0; i < links.size() -1; i++) {
links[i]->next = links[i+1]; // Make the next pointers reflect the ordering.
}
links.back()->next = links.front(); // complete the circle
struct MissingLink * l = links[0];
start_measurement(tag);
l = do_misses(l, access_count); // Do the misses.
end_measurement();
delete alloc;
}
//END
Here's what the benchmark does.
It creates an allocator and warms it up with `exercise()`.

It builds a miss machine out of links allocated one at a time from your allocator. This is different from the description of the miss machine given earlier in the lab: in that case, we allocated the links in an array, guaranteeing good spatial locality.

Then it measures the execution time of a call to `do_misses()`, which traverses the miss machine.
Read through the code and comments carefully to understand it.
C++ likes to keep you safe by not letting you convert pointers to integers or convert pointers from one type to another. However, memory allocators need to break the rules occasionally to transform untyped bytes into objects. The main tools for this are `reinterpret_cast<>()` and the "in place" `new` operator. `ReferenceAllocator` provides an example of how to use both mechanisms.
### `reinterpret_cast<>()`

`reinterpret_cast<>()` lets you change a value from one type to another as long as the two types are the same size. So you can do this:
char *x;
int *y = reinterpret_cast<int *>(x);
// or
void * x = alloc_chunk();
T * t = reinterpret_cast<T*>(x);
If you are familiar with C's `(T*)(x)` casting syntax, `reinterpret_cast` is equivalent, but preferred because it's easier to spot in code.
A related tool is `uintptr_t`, which is an unsigned integer that is the same size as a pointer. So you can increment a pointer by one byte by doing this:
int * x = new int;
uintptr_t n = reinterpret_cast<uintptr_t>(x);
n += 1;
x = reinterpret_cast<int *>(n);
Note that this is different from:
int *x = new int;
x++;
The difference is that, under C++'s pointer arithmetic rules, incrementing a pointer of type `T*` actually increases the address by `sizeof(T)` (i.e., 4 bytes for an `int`).
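For instance, one common use of this kind of arithmetic is rounding an address up to the next alignment boundary. The helper below is just a sketch, not a required part of your solution; it assumes the alignment is a power of two (which it always is in this lab):

```cpp
#include <cstdint>

// Hypothetical helper: round `addr` up to the next multiple of `alignment`.
// Only valid when `alignment` is a power of two.
uintptr_t round_up(uintptr_t addr, uintptr_t alignment) {
    return (addr + alignment - 1) & ~(alignment - 1);
}

// Usage sketch:
//   void *chunk = alloc_chunk();
//   uintptr_t first = round_up(reinterpret_cast<uintptr_t>(chunk), 64);
//   T *t = reinterpret_cast<T*>(first);
```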
### In-place `new`

If you call `new T`, C++ will allocate some memory to hold a new instance of `T` and then run `T`'s constructor on it. But what if you want to decide where to construct the new instance?
In that case you can say something like this:
void * p = alloc_chunk();
new (p) T;
This constructs a new instance of `T` "in place" in the memory pointed to by `p`. Or, if you wanted to initialize an instance of `T` starting at the 11th byte after `p`, you could do this:
uintptr_t n = reinterpret_cast<uintptr_t>(p);
n += 11;
void *q = reinterpret_cast<void*>(n);
new (q) T;
Why would anyone ever want to do that?
Here are some tips about how to approach this lab.

Here are the basic steps that your memory allocator must accomplish over its lifetime:
1. The constructor sets up the allocator's (initially empty) bookkeeping state.
2. When the program calls `alloc()`:
   A. If no suitable memory is available, get another chunk with `alloc_chunk()`, and goto 2.B.
   B. Return a properly aligned, zeroed, newly constructed instance of `T`.
3. When the program calls `free()`, record that the memory is available for reuse.
4. When the allocator is destroyed, call `free_chunk()` to deallocate all the memory you allocated with `alloc_chunk()`.

Here are some ideas about how to get started:
- The key difference between `AlignedAllocator` and `posix_memalign()` is that `AlignedAllocator` only needs to allocate objects of a single size. You can exploit this fact to improve performance.
- `alloc_chunk()` lets you allocate enough space to store many objects. Since the objects are all the same size and alignment, you can calculate where each instance of `T` will reside within the chunk.
- Think about what you could store in `free()`ed memory while it's waiting to be re-`alloc()`ed.

Your overall score is based on your allocator's performance across eight benchmarks. This might seem daunting, but the benchmarks' performance is highly correlated: if you speed up your allocator for one of them, it will get faster on many of the others.
With that in mind, start with the simplest ones and go from there. I'd proceed in this order:

1. The `microbench` function.
2. The `bench` function.
3. The `miss_machine` function.

Debugging your memory allocator can be challenging, because there are many opportunities to make mistakes, and some errors (e.g., erroneously placing an object on your free list) may not cause a problem for a long time. As a result, when the error is finally caught by a regression test or a segmentation fault, the underlying cause might be very unclear.
A useful approach to debugging this kind of code is to focus on invariants that your memory allocator should maintain. An invariant is a condition that should be true before and after the application calls any of the methods of your allocator. For instance, here are some invariants that might hold for your allocator:

1. Every pointer returned by `alloc()` should be aligned correctly.
2. No pointer should appear on your free list more than once.
3. Everything on your free list should lie within a chunk you allocated with `AllocChunk()`.
4. No object you have handed out (and not yet freed) should overlap anything on your free list.

Of these, 1 and 3 should hold for all allocators that correctly address the programming assignment, since they are restatements of the PA's requirements. The others happen to hold for my implementation. You should think about whether they apply to yours as well. They probably do, but the lab doesn't require it.
Invariants like these are useful because you can check them automatically and if you ever find that one of them is violated, then you know you have found a bug (either in your code or your invariant checker).
Let's look at how we could check the first invariant. I could implement my `alloc()` method like this (this is pseudocode):
#include<cassert>
T* alloc() {
// Find the memory and initialize it, etc.
assert(reinterpret_cast<uintptr_t>(r) % Alignment == 0);
return r;
}
That would cause an assertion failure if I ever returned an incorrectly-aligned piece of data. If you haven't met `assert()`, it's a very useful tool: it takes an expression and aborts the program if the expression is false.
That one is easy enough, but what about invariant 3? That's more complicated, but one approach would be to add a method to your allocator called `check_invariants()`. Here's what that function might do (again in pseudocode -- don't expect this to compile or be bug free):
void check_invariants() {
for(c: my_chunks) {
bool found = false;
for(t: free_list) {
if (the beginning of t is in c && the end of t is in c)
found = true;
}
assert(found);
}
}
Now I can use `check_invariants()` like so:
T* alloc() {
check_invariants();
// Find the memory and initialize it, etc.
assert(reinterpret_cast<uintptr_t>(r) % Alignment == 0);
check_invariants();
return r;
}
void free(T*) {
check_invariants();
// do stuff
check_invariants();
}
You should call it at the end of the constructor and the beginning (and maybe end) of the destructor.
In fact, you might be able to call it after every line of your code to see exactly where one of the invariants first gets broken.
Checking the other invariants is up to you.
But Won't This Make My Code Slow?
Definitely. It'll make it very, very slow. Which is why you only run it while debugging.
Here's a useful trick:
//#define CHECK_INVARIANTS() // uncomment for production
#define CHECK_INVARIANTS() check_invariants() // uncomment for debugging.
// other code
void free(T*) {
CHECK_INVARIANTS();
// do stuff
CHECK_INVARIANTS();
}
Now you can turn all the checking on and off in one place and it costs zero performance when it matters.
When you are done, make sure your best allocator is called `AlignedAllocator` in `AlignedAllocator.hpp`. Then you can submit your code to the Gradescope autograder. It will run the commands given above and compute your grade.
Your grade is based on your speedup relative to `ReferenceAllocator` on the eight benchmarks.

For each of them, there's a target speedup given in the table below.

You don't get extra credit for beating the targets. This will help ensure that your design is balanced: you must do well on all 8 benchmarks to do well on the lab.
To get points, your code must also be correct. The autograder will run the regressions in `run_tests.cpp` to check its correctness. There are hidden tests.
You can mimic what the autograder will do with the command below, and then run the next cell below to list them and the target speedups.
After you run it, the results will be in `autograde/bench.csv`, `autograde/microbench.csv`, and `autograde/miss_machine.csv` rather than `./bench.csv`, `./microbench.csv`, and `miss_machine.csv`. This command builds and runs your code in a more controlled way by doing the following:
1. It builds your code with your `AlignedAllocator.cpp`.
2. It runs the benchmarks with `run_bench.py`.
3. It runs `autograde.py` to compute your grade.

Running the cell below will do the same thing as the Gradescope autograder, and the cell after it shows the names and target speedups for each benchmark. This takes a few minutes to run.
This lab completes our tour of (single-processor) memory systems. It explored what's required to exploit temporal locality and when it does and does not exist. It also looked at other key components of the memory hierarchy: the lower-level caches and the TLB. Finally, it developed an optimized version of 1-D convolution using tiling and renesting. You should now be well-prepared for the next lab, where we will explore (among other things) how multiple processors further complicate the performance of the memory hierarchy.