sppu-practical


CUDA matrix multiplication

#include <cuda_runtime.h>
#include <iostream>

// Each thread computes one element C[Row][Col] of the N x N product.
__global__ void matmul(int* A, int* B, int* C, int N) {
    int Row = blockIdx.y * blockDim.y + threadIdx.y;
    int Col = blockIdx.x * blockDim.x + threadIdx.x;
    if (Row < N && Col < N) {
        int Pvalue = 0;
        for (int k = 0; k < N; k++) {
            Pvalue += A[Row * N + k] * B[k * N + Col];
        }
        C[Row * N + Col] = Pvalue;
    }
}

int main() {
    int N = 512;
    int size = N * N * sizeof(int);
    int *A, *B, *C;
    int *dev_A, *dev_B, *dev_C;

    // Pinned host memory speeds up host<->device transfers
    cudaMallocHost(&A, size);
    cudaMallocHost(&B, size);
    cudaMallocHost(&C, size);
    cudaMalloc(&dev_A, size);
    cudaMalloc(&dev_B, size);
    cudaMalloc(&dev_C, size);

    // Initialize matrices A and B
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            A[i * N + j] = i * N + j;
            B[i * N + j] = j * N + i;
        }
    }

    cudaMemcpy(dev_A, A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(dev_B, B, size, cudaMemcpyHostToDevice);

    // Round the grid up so the kernel also covers N not divisible by 16
    dim3 dimBlock(16, 16);
    dim3 dimGrid((N + dimBlock.x - 1) / dimBlock.x,
                 (N + dimBlock.y - 1) / dimBlock.y);

    matmul<<<dimGrid, dimBlock>>>(dev_A, dev_B, dev_C, N);

    // cudaMemcpy synchronizes with the kernel before copying back
    cudaMemcpy(C, dev_C, size, cudaMemcpyDeviceToHost);

    // Print the top-left 10x10 corner of the result
    for (int i = 0; i < 10; i++) {
        for (int j = 0; j < 10; j++) {
            std::cout << C[i * N + j] << " ";
        }
        std::cout << std::endl;
    }

    // Free memory
    cudaFree(dev_A);
    cudaFree(dev_B);
    cudaFree(dev_C);
    cudaFreeHost(A);
    cudaFreeHost(B);
    cudaFreeHost(C);

    return 0;
}
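
None of the CUDA calls above check their return values, so a failure (say, an out-of-memory cudaMalloc) passes silently. A minimal error-checking sketch using only the standard cudaError_t / cudaGetErrorString API; the CUDA_CHECK name is a common convention, not part of CUDA, and the file name in the compile line (nvcc matmul.cu -o matmul) is illustrative:

#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

// Abort with a readable message if a CUDA runtime call fails.
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err = (call);                                         \
        if (err != cudaSuccess) {                                         \
            std::fprintf(stderr, "CUDA error %s at %s:%d\n",              \
                         cudaGetErrorString(err), __FILE__, __LINE__);    \
            std::exit(EXIT_FAILURE);                                      \
        }                                                                 \
    } while (0)

// Usage: CUDA_CHECK(cudaMalloc(&dev_A, size));
// A kernel launch returns nothing; check it afterwards with
// CUDA_CHECK(cudaGetLastError());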

CUDA vector addition

// program 2

#include <iostream>
#include <cuda_runtime.h>

using namespace std;

// Each thread adds one pair of elements.
__global__ void addVectors(int* A, int* B, int* C, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
    {
        C[i] = A[i] + B[i];
    }
}

int main()
{
    int n = 1000000;
    int *A, *B, *C;
    int size = n * sizeof(int);

    // Allocate pinned memory on the host
    cudaMallocHost(&A, size);
    cudaMallocHost(&B, size);
    cudaMallocHost(&C, size);

    // Initialize the vectors
    for (int i = 0; i < n; i++)
    {
        A[i] = i;
        B[i] = i * 2;
    }

    // Allocate memory on the device
    int *dev_A, *dev_B, *dev_C;
    cudaMalloc(&dev_A, size);
    cudaMalloc(&dev_B, size);
    cudaMalloc(&dev_C, size);

    // Copy data from host to device
    cudaMemcpy(dev_A, A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(dev_B, B, size, cudaMemcpyHostToDevice);

    // Launch the kernel; round the block count up so all n elements are covered
    int blockSize = 256;
    int numBlocks = (n + blockSize - 1) / blockSize;
    addVectors<<<numBlocks, blockSize>>>(dev_A, dev_B, dev_C, n);

    // Copy data from device to host
    cudaMemcpy(C, dev_C, size, cudaMemcpyDeviceToHost);

    // Print the first 10 results
    for (int i = 0; i < 10; i++)
    {
        cout << C[i] << " ";
    }
    cout << endl;

    // Free memory
    cudaFree(dev_A);
    cudaFree(dev_B);
    cudaFree(dev_C);
    cudaFreeHost(A);
    cudaFreeHost(B);
    cudaFreeHost(C);

    return 0;
}
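
The launch above sizes the grid to the data, which works but ties the kernel to one launch configuration. A common alternative is a grid-stride loop, where any grid size covers any n; this variant is a sketch (the addVectorsStride name is illustrative), not part of the program above:

__global__ void addVectorsStride(const int* A, const int* B, int* C, int n)
{
    // Each thread starts at its global index and advances by the total
    // number of threads in the grid, so every element is handled exactly once.
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
         i += blockDim.x * gridDim.x)
    {
        C[i] = A[i] + B[i];
    }
}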

C++ parallel BFS/DFS (OpenMP)

#include <iostream>
#include <vector>
#include <queue>
#include <stack>
#include <algorithm>
#include <omp.h>

using namespace std;

class Graph {
    int V;
    vector<vector<int>> adjList;

public:
    Graph(int V) {
        this->V = V;
        adjList.resize(V);
    }

    void addEdge(int src, int dest) {
        adjList[src].push_back(dest);
        adjList[dest].push_back(src); // For undirected graph
    }

    vector<int> getNeighbors(int vertex) {
        return adjList[vertex];
    }
};

void parallelBFS(Graph& graph, int source, vector<bool>& visited) {
    queue<int> q;
    q.push(source);
    visited[source] = true;

    while (!q.empty()) {
        int current = q.front();
        q.pop();
        cout << "Visited: " << current << endl;

        vector<int> neighbors = graph.getNeighbors(current);
        // The neighbors of the current vertex are examined in parallel.
        // std::queue and vector<bool> are not thread-safe, so the
        // check-and-push must be serialized with a critical section.
        #pragma omp parallel for
        for (int i = 0; i < (int)neighbors.size(); ++i) {
            int neighbor = neighbors[i];
            #pragma omp critical
            {
                if (!visited[neighbor]) {
                    visited[neighbor] = true;
                    q.push(neighbor);
                }
            }
        }
    }
}

void parallelDFS(Graph& graph, int source, vector<bool>& visited) {
    stack<int> s;
    s.push(source);
    visited[source] = true;

    while (!s.empty()) {
        int current = s.top();
        s.pop();
        cout << "Visited: " << current << endl;

        vector<int> neighbors = graph.getNeighbors(current);
        // Same pattern as BFS: the stack updates need synchronization.
        #pragma omp parallel for
        for (int i = 0; i < (int)neighbors.size(); ++i) {
            int neighbor = neighbors[i];
            #pragma omp critical
            {
                if (!visited[neighbor]) {
                    visited[neighbor] = true;
                    s.push(neighbor);
                }
            }
        }
    }
}

int main() {
    int V, E;
    cout << "Enter the number of vertices: ";
    cin >> V;
    Graph graph(V);
    cout << "Enter the number of edges: ";
    cin >> E;
    cout << "Enter the edges (src dest):" << endl;
    for (int i = 0; i < E; ++i) {
        int src, dest;
        cin >> src >> dest;
        graph.addEdge(src, dest);
    }

    vector<bool> visited(V, false);

    cout << "Parallel BFS:" << endl;
    parallelBFS(graph, 0, visited);

    // Reset visited array for DFS
    fill(visited.begin(), visited.end(), false);

    cout << "Parallel DFS:" << endl;
    parallelDFS(graph, 0, visited);

    return 0;
}
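
The critical section above serializes most of the work, so the parallelism is largely cosmetic. A common restructuring is level-synchronous BFS, where each frontier is expanded in parallel into the next one. A minimal sketch reusing the Graph class above; the levelBFS name and the frontier vectors are illustrative choices, not from the original:

void levelBFS(Graph& graph, int source, vector<bool>& visited) {
    vector<int> frontier;
    frontier.push_back(source);
    visited[source] = true;

    while (!frontier.empty()) {
        vector<int> next;
        // All vertices in the current frontier are expanded in parallel
        // (printed lines may interleave); only the shared visited/next
        // updates need the critical section.
        #pragma omp parallel for
        for (int i = 0; i < (int)frontier.size(); ++i) {
            cout << "Visited: " << frontier[i] << endl;
            vector<int> neighbors = graph.getNeighbors(frontier[i]);
            for (int j = 0; j < (int)neighbors.size(); ++j) {
                #pragma omp critical
                {
                    if (!visited[neighbors[j]]) {
                        visited[neighbors[j]] = true;
                        next.push_back(neighbors[j]);
                    }
                }
            }
        }
        frontier = next;
    }
}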

Boston housing price regression (Keras)

import numpy as np
from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from sklearn import preprocessing

(X_train, Y_train), (X_test, Y_test) = keras.datasets.boston_housing.load_data()

print("Training data shape:", X_train.shape)
print("Test data shape:", X_test.shape)
print("Train output data shape:", Y_train.shape)
print("Actual Test output data shape:", Y_test.shape)

# Normalize the data
X_train = preprocessing.normalize(X_train)
X_test = preprocessing.normalize(X_test)

# Model building: a small fully connected regression network
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=X_train[0].shape))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1))  # single linear output for the predicted price

model.summary()

model.compile(loss='mse', optimizer='rmsprop', metrics=['mae'])

history = model.fit(X_train, Y_train, epochs=100, batch_size=1, verbose=1,
                    validation_data=(X_test, Y_test))

results = model.evaluate(X_test, Y_test)
print(results)

Fashion MNIST classification (Keras)

from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt

fashion_mnist = keras.datasets.fashion_mnist
(train_img, train_labels), (test_img, test_labels) = fashion_mnist.load_data()

# Scale pixel values to [0, 1]
train_img = train_img / 255.0
test_img = test_img / 255.0

model = keras.Sequential([
    keras.layers.Flatten(input_shape=(28, 28)),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dense(10, activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

model.fit(train_img, train_labels, epochs=10)

test_loss, test_acc = model.evaluate(test_img, test_labels)
print("Test accuracy:", test_acc)

predictions = model.predict(test_img)
predicted_labels = np.argmax(predictions, axis=1)

# Show each test image next to the model's class probabilities
num_rows = 5
num_cols = 5
num_imgs = num_rows * num_cols

plt.figure(figsize=(2 * 2 * num_cols, 2 * num_rows))
for i in range(num_imgs):
    plt.subplot(num_rows, 2 * num_cols, 2 * i + 1)
    plt.imshow(test_img[i], cmap='gray')
    plt.axis("off")
    plt.subplot(num_rows, 2 * num_cols, 2 * i + 2)
    plt.bar(range(10), predictions[i])
    plt.xticks(range(10))
    plt.ylim([0, 1])
    plt.title(f"predicted: {predicted_labels[i]}")
plt.tight_layout()
plt.show()

IMDB sentiment classification (Keras)

from keras.datasets import imdb
import numpy as np

(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)

# Sanity check: no word index should exceed num_words - 1
print(max(max(sequence) for sequence in train_data))

# Decode the first review back to text (indices are offset by 3 for the
# reserved padding/start/unknown tokens)
word_index = imdb.get_word_index()
reverse_word_index = dict([(val, key) for (key, val) in word_index.items()])
decoded_review = ' '.join([reverse_word_index.get(i - 3, '?') for i in train_data[0]])

# Multi-hot encode each review as a 10000-dimensional 0/1 vector
def vectorize(sequences, dimension=10000):
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1
    return results

x_train = vectorize(train_data)
x_test = vectorize(test_data)
y_train = np.asarray(train_labels).astype('float32')
y_test = np.asarray(test_labels).astype('float32')

from keras import models
from keras import layers

model = models.Sequential()
model.add(layers.Dense(16, activation='relu', input_shape=(10000,)))
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

# Hold out the first 10000 training samples for validation
x_val = x_train[:10000]
y_val = y_train[:10000]
partial_x = x_train[10000:]
partial_y = y_train[10000:]

history = model.fit(partial_x, partial_y, epochs=20, batch_size=512,
                    validation_data=(x_val, y_val))
results = model.evaluate(x_test, y_test)
print(results)

min/max/sum/average reductions (OpenMP, C++)

#include <iostream>
#include <vector>
#include <omp.h>
#include <climits>

using namespace std;

void min_reduction(vector<int>& arr) {
    int min_value = INT_MAX;
    // Each thread keeps a private minimum; OpenMP combines them at the end.
    #pragma omp parallel for reduction(min: min_value)
    for (int i = 0; i < (int)arr.size(); i++) {
        if (arr[i] < min_value) {
            min_value = arr[i];
        }
    }
    cout << "Minimum value: " << min_value << endl;
}

void max_reduction(vector<int>& arr) {
    int max_value = INT_MIN;
    #pragma omp parallel for reduction(max: max_value)
    for (int i = 0; i < (int)arr.size(); i++) {
        if (arr[i] > max_value) {
            max_value = arr[i];
        }
    }
    cout << "Maximum value: " << max_value << endl;
}

void sum_reduction(vector<int>& arr) {
    int sum = 0;
    #pragma omp parallel for reduction(+: sum)
    for (int i = 0; i < (int)arr.size(); i++) {
        sum += arr[i];
    }
    cout << "Sum: " << sum << endl;
}

void average_reduction(vector<int>& arr) {
    int sum = 0;
    #pragma omp parallel for reduction(+: sum)
    for (int i = 0; i < (int)arr.size(); i++) {
        sum += arr[i];
    }
    cout << "Average: " << (double)sum / arr.size() << endl;
}

int main() {
    vector<int> arr = {5, 2, 9, 1, 7, 6, 8, 3, 4};

    min_reduction(arr);
    max_reduction(arr);
    sum_reduction(arr);
    average_reduction(arr);

    return 0;
}
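
The OpenMP programs here need OpenMP enabled at compile time; with GCC or Clang (an assumption about the compiler in use) that is the -fopenmp flag, for example (file name illustrative):

g++ -fopenmp reductions.cpp -o reductions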

parallel sorting (OpenMP, C++)

#include <iostream>
#include <vector>
#include <cstdlib>
#include <omp.h>

// Function to perform bubble sort (sequential baseline)
void bubbleSort(std::vector<int>& arr) {
    int n = arr.size();
    for (int i = 0; i < n - 1; i++) {
        for (int j = 0; j < n - i - 1; j++) {
            if (arr[j] > arr[j + 1]) {
                std::swap(arr[j], arr[j + 1]);
            }
        }
    }
}

// Function to merge two sorted subvectors arr[l..m] and arr[m+1..r]
void merge(std::vector<int>& arr, int l, int m, int r) {
    int n1 = m - l + 1;
    int n2 = r - m;

    std::vector<int> L(n1), R(n2);

    for (int i = 0; i < n1; i++)
        L[i] = arr[l + i];
    for (int j = 0; j < n2; j++)
        R[j] = arr[m + 1 + j];

    int i = 0, j = 0, k = l;
    while (i < n1 && j < n2) {
        if (L[i] <= R[j]) {
            arr[k] = L[i];
            i++;
        } else {
            arr[k] = R[j];
            j++;
        }
        k++;
    }

    while (i < n1) {
        arr[k] = L[i];
        i++;
        k++;
    }

    while (j < n2) {
        arr[k] = R[j];
        j++;
        k++;
    }
}

// Function to perform merge sort; the two halves are sorted in
// parallel sections
void mergeSort(std::vector<int>& arr, int l, int r) {
    if (l < r) {
        int m = l + (r - l) / 2;

        #pragma omp parallel sections
        {
            #pragma omp section
            mergeSort(arr, l, m);
            #pragma omp section
            mergeSort(arr, m + 1, r);
        }

        merge(arr, l, m, r);
    }
}

// Function to print a vector
void printVector(const std::vector<int>& arr) {
    for (int num : arr)
        std::cout << num << " ";
    std::cout << std::endl;
}

int main() {
    int n = 10000; // Size of vector
    std::vector<int> arr(n), arr_copy(n);

    // Initialize vector with random values
    srand(42);
    for (int i = 0; i < n; i++) {
        arr[i] = rand() % 10000;
        arr_copy[i] = arr[i];
    }

    std::cout << "Original vector:" << std::endl;
    printVector(arr);

    // Sequential bubble sort
    double start = omp_get_wtime();
    bubbleSort(arr);
    double end = omp_get_wtime();
    std::cout << "\nSequential Bubble Sort: " << end - start << " seconds" << std::endl;
    //printVector(arr);

    // Parallel merge sort
    start = omp_get_wtime();
    mergeSort(arr_copy, 0, n - 1);
    end = omp_get_wtime();
    std::cout << "\nParallel Merge Sort: " << end - start << " seconds" << std::endl;
    //printVector(arr_copy);

    return 0;
}
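
Note that #pragma omp parallel sections spawns a fresh parallel region at every recursion level; nested regions are usually serialized, and the region-creation overhead can make this "parallel" sort slower than a sequential one. A common alternative uses OpenMP tasks with a size cutoff. A sketch reusing the merge function above; the mergeSortTasks name and the 1000-element cutoff are illustrative choices, not from the original:

// Task-based merge sort: tasks are created only for large subranges so
// scheduling overhead does not dominate on small ones.
void mergeSortTasks(std::vector<int>& arr, int l, int r) {
    if (l >= r) return;
    int m = l + (r - l) / 2;
    if (r - l > 1000) {  // cutoff: recurse sequentially below this size
        #pragma omp task shared(arr)
        mergeSortTasks(arr, l, m);
        #pragma omp task shared(arr)
        mergeSortTasks(arr, m + 1, r);
        #pragma omp taskwait  // both halves must be sorted before merging
    } else {
        mergeSortTasks(arr, l, m);
        mergeSortTasks(arr, m + 1, r);
    }
    merge(arr, l, m, r);
}

// Launched from a single thread inside one parallel region:
// #pragma omp parallel
// {
//     #pragma omp single
//     mergeSortTasks(arr_copy, 0, n - 1);
// }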
