Stack Overflow Asked on November 27, 2021
I need to insert 1.9 million new records into a MySQL database. To do this I'm using C# Entity Framework, but the process seems incredibly slow. At the current rate, it would take several days to process all of these records.
What am I doing wrong and how do I speed this up?
In the database I have 2 tables: Hashes and Categories. Each hash should be unique and can have multiple categories, with only 1 category being active per hash.
The process that I need to follow is to first check if the hash exists. If it does, then I need to find the currently active category, deactivate it and add the new one.
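Each line of the input file is parsed into a small DTO (InputHashModel) and matched against the Hash entity. Simplified, they look roughly like this:
public class InputHashModel
{
    public string MD5 { get; set; }
    public string Category { get; set; }
}
public class Hash
{
    public int Id { get; set; }
    public string MD5 { get; set; }
    public Hash() { }
    public Hash(string md5) { MD5 = md5; }
}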
The problem is that my try { } block takes about 150 ms per record and the block that calls SaveChanges() takes about 15-30 seconds. So, doing 1.9M records this way will take several days.
using (var reader = new StreamReader(File.OpenRead(filepath)))
using (MySQLContext db = new MySQLContext(options))
{
    // Disable auto detect changes
    db.ChangeTracker.AutoDetectChangesEnabled = false;

    int loopCounter = 0;
    string line;

    // Load up the db tables in memory
    var hashes = db.Hashes.Select(x => x).ToList();
    var categories = db.Categories.Select(a => a).ToList();

    while ((line = reader.ReadLine()) != null)
    {
        var matches = Regex.Matches(line, "(?<MD5>[a-zA-Z0-9]+)(?<Category>[0-9])");

        InputHashModel inputHash = new InputHashModel()
        {
            MD5 = matches[0].Groups["MD5"].Value,
            Category = matches[0].Groups["Category"].Value
        };

        try
        {
            // Check if hash already exists
            Hash hash = hashes.Where(h => h.MD5 == inputHash.MD5).FirstOrDefault();

            // If hash doesn't exist - add it
            if (hash == null)
                hash = new Hash(inputHash.MD5);
            else
            {
                // Check if category already exists
                Category category = categories.Where(a => a.Active == true && a.HashId == hash.Id).FirstOrDefault();

                // If it exists - deactivate it
                if (category != null)
                {
                    // If the same category already exists - proceed to next hash
                    if (category.Source == "ThisInput" && category.Category == inputHash.Category)
                    {
                        loopCounter++;
                        continue;
                    }

                    category.Active = false;
                    category.DeactivatedTimestamp = DateTime.Now;
                }
            }

            // Add new category
            Category new_category = new Category() { Hash = hash, Source = "ThisInput", Category = inputHash.Category, Active = true };
            db.Categories.Add(new_category);

            // Save changes every 1000
            if (loopCounter % 1000 == 0)
            {
                db.ChangeTracker.DetectChanges();
                db.SaveChanges();
            }
        }
        catch (Exception e)
        {
            Console.WriteLine("Exception: " + e);
        }

        loopCounter++;
    }

    db.ChangeTracker.AutoDetectChangesEnabled = true;
    db.SaveChanges();

    Console.WriteLine("Finished");
}
I managed to achieve this through the use of the BulkInsert() and BulkUpdate() functions from Entity Framework Extensions.
I went about it a slightly different way though: I compare the input hashes against the ones already in the database using .Intersect() with a custom comparer (myCustomComparer). Do that twice to separate unique and duplicate entries into two lists. Here's the code for the functions:
public class HashMD5Comparer : IEqualityComparer<Hash>
{
    // Hashes are equal if their MD5 strings are equal.
    public bool Equals(Hash x, Hash y)
    {
        // Check whether the compared objects reference the same data.
        if (Object.ReferenceEquals(x, y)) return true;

        // Check whether any of the compared objects is null.
        if (Object.ReferenceEquals(x, null) || Object.ReferenceEquals(y, null))
            return false;

        // Check whether the hashes' properties are equal.
        return x.MD5 == y.MD5;
    }

    // If Equals() returns true for a pair of objects
    // then GetHashCode() must return the same value for these objects.
    public int GetHashCode(Hash hash)
    {
        // Check whether the object is null.
        if (Object.ReferenceEquals(hash, null)) return 0;

        // Get hash code for the MD5 field if it is not null.
        int hashMD5 = hash.MD5 == null ? 0 : hash.MD5.GetHashCode();

        // Calculate the hash code for the hash.
        return hashMD5;
    }
}
public class HashComparer
{
    private static Hash[] uniqueHashes;
    private static Hash[] duplicateHashes;
    private static Hash[] duplicateHashesInDb;

    private static void SortHashes(Hash[] hashes)
    {
        Hash[] hashesInDatabase;

        // Download hashes from database
        using (MySQLContext db = new MySQLContext())
        {
            hashesInDatabase = db.Hashes.Where(h => h.MD5 != null).ToArray();
        }

        // Find duplicates (input hashes already in the database, and their database counterparts)
        duplicateHashes = hashes.Intersect(hashesInDatabase, new HashMD5Comparer()).ToArray();
        duplicateHashesInDb = hashesInDatabase.Intersect(hashes, new HashMD5Comparer()).ToArray();

        // Find uniques (input hashes not yet in the database)
        uniqueHashes = hashes.Except(duplicateHashes, new HashMD5Comparer()).ToArray();
    }
    private static void ActionDuplicateHashes()
    {
        Assessment[] assessmentsInDatabase;
        List<Assessment> assessmentsToDeactivate = new List<Assessment>();
        List<Assessment> assessmentsToAdd = new List<Assessment>();

        // Download assessments from database
        using (MySQLContext db = new MySQLContext(GenerateMySQLOptions()))
        {
            var duplicateHashIds = duplicateHashesInDb.Select(h => h.Id).ToArray();
            assessmentsInDatabase = db.Assessments.Where(a => duplicateHashIds.Contains(a.HashId)).ToArray();
        }

        foreach (var inputHash in duplicateHashes)
        {
            // Lookup the hash in the database to get the ID
            var liveHashId = Array.Find(duplicateHashesInDb, h => h.MD5 == inputHash.MD5).Id;

            // Find the assessment in the database to compare (and deactivate if needed)
            var liveAssessment = Array.Find(assessmentsInDatabase, a => a.HashId == liveHashId);

            // Get the new assessment of the hash
            var newAssessment = inputHash.Assessments.FirstOrDefault();
            if (newAssessment == null)
            {
                Console.WriteLine($"Failed lookup for new assessment {inputHash.MD5}");
                return;
            }

            // Set the hashId (relationship) for the new assessment
            newAssessment.HashId = liveHashId;

            if (liveAssessment != null)
            {
                if (liveAssessment.Origin == newAssessment.Origin &&
                    liveAssessment.Category == newAssessment.Category)
                {
                    // Exact duplicate - leave as is
                }
                else
                {
                    // Deactivate the current assessment in the database
                    liveAssessment.Active = false;

                    // Add the assessment to a list to deactivate (update)
                    assessmentsToDeactivate.Add(liveAssessment);

                    // Add the new assessment that will be added once the old one gets deactivated
                    assessmentsToAdd.Add(newAssessment);
                }
            }
            else
            {
                // No assessment for the hash in the database - just add a new one
                assessmentsToAdd.Add(newAssessment);
            }
        }

        // Bulk update the assessments in the database that are to be deactivated
        using (MySQLContext db = new MySQLContext(GenerateMySQLOptions()))
        {
            db.Assessments.BulkUpdate(assessmentsToDeactivate);
        }

        // Bulk insert the new assessments
        using (MySQLContext db = new MySQLContext(GenerateMySQLOptions()))
        {
            db.Assessments.BulkInsert(assessmentsToAdd);
        }
    }
    private static void ActionUniqueHashes()
    {
        // Bulk insert all unique hashes and their assessments
        using (MySQLContext db = new MySQLContext())
        {
            // options.IncludeGraph adds any related entities to the database as well
            db.Hashes.BulkInsert(uniqueHashes, options => options.IncludeGraph = true);
        }
    }
}
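Putting it together, the entry point is something along these lines (a hypothetical wrapper; it would sit inside HashComparer since the three methods above are private, and it assumes the input file has already been parsed into a Hash[] with their Assessments attached):
public static void Import(Hash[] inputHashes)
{
    // Split the input into hashes that are new and hashes that already exist in the database
    SortHashes(inputHashes);

    // Insert brand-new hashes (and their assessments) in one bulk operation
    ActionUniqueHashes();

    // For existing hashes: deactivate superseded assessments and add the new ones
    ActionDuplicateHashes();
}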
There is more optimisation to be done because this uses A LOT of RAM, especially when doing the unique hashes bulk insert (not sure why). But all in all it works.
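If memory becomes a real problem, one option might be to feed the unique hashes to BulkInsert() in chunks instead of one big array. A minimal sketch, assuming a chunk size of 50,000 and the same IncludeGraph option as above:
const int chunkSize = 50000;
for (int i = 0; i < uniqueHashes.Length; i += chunkSize)
{
    // Take the next slice of unique hashes
    var chunk = uniqueHashes.Skip(i).Take(chunkSize).ToArray();

    // New context per chunk, mirroring ActionUniqueHashes() above
    using (MySQLContext db = new MySQLContext())
    {
        db.Hashes.BulkInsert(chunk, options => options.IncludeGraph = true);
    }
}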
Answered by ZeW on November 27, 2021
This is never going to be the fastest method, but at a minimum you need to avoid accumulating all the entities in the change tracker. E.g. after each SaveChanges() run:
foreach (var e in db.ChangeTracker.Entries())
{
    e.State = EntityState.Detached;
}
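Applied to the loop in the question, that would go right after the periodic save, for example:
// Save changes every 1000, then clear the change tracker
if (loopCounter % 1000 == 0)
{
    db.ChangeTracker.DetectChanges();
    db.SaveChanges();

    // Detach everything that was just saved so the tracker doesn't keep growing
    foreach (var e in db.ChangeTracker.Entries())
    {
        e.State = EntityState.Detached;
    }
}
One caveat: once an entity is detached, later changes to it (e.g. deactivating a previously loaded Category) won't be picked up by SaveChanges() unless it is attached again.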
Answered by David Browne - Microsoft on November 27, 2021