From 865e5bff1ea1eea451365fe04a29eb9637ec3172 Mon Sep 17 00:00:00 2001 From: Daniel Kraus Date: Wed, 2 Dec 2015 14:20:42 +0100 Subject: [PATCH] Split prescriptions regexes. --- zaaReloaded2/Medication/Prescription.cs | 41 ++++++++++++++++++++----- 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/zaaReloaded2/Medication/Prescription.cs b/zaaReloaded2/Medication/Prescription.cs index 2a3f7d3..a4723cf 100755 --- a/zaaReloaded2/Medication/Prescription.cs +++ b/zaaReloaded2/Medication/Prescription.cs @@ -37,7 +37,7 @@ namespace zaaReloaded2.Medication /// True if the line contains prescriptions. public static bool IsPrescriptionLine(string line) { - return lineRegex.IsMatch(line); + return canonicalRegex.IsMatch(line); } #endregion @@ -55,7 +55,7 @@ namespace zaaReloaded2.Medication // Replace any runs of whitespace with a single space // (from http://stackoverflow.com/a/206946/270712) // line = Regex.Replace(line, @"\s+", " "); - Match m = lineRegex.Match(line); + Match m = unifiedRegex.Match(line); int n = m.Groups[DOSE_GROUP].Captures.Count; return new Prescription( @@ -76,7 +76,7 @@ namespace zaaReloaded2.Medication public static IList ManyFromLine(string line) { // line = Regex.Replace(line, @"\s+", " "); - MatchCollection mc = lineRegex.Matches(line); + MatchCollection mc = unifiedRegex.Matches(line); List list = new List(); foreach (Match m in mc) { @@ -197,16 +197,41 @@ namespace zaaReloaded2.Medication private const string DOSE = @"(\d\s+1/[234]|(\d\s?)?[\u00bd\u2153\u00bc]|\d+)"; private const string SPACER = @"(\s*[-\u2012\u2013\u2014]+\s*)"; - // Enclose entire regular expression in parentheses so we can use it - // to split a line and capture the delimiter. - private static readonly Regex lineRegex = new Regex( - @"(((?[^\t]+)\s+" + + /// + /// The 'canonical' regex matches a prescription of the form "Ramipril 5 mg 1-0-0" + /// with or without trailing comment. 
+ /// + /// + /// Enclose entire regular expression in parentheses so we can use it + /// to split a line and capture the delimiter. + /// + private const string canonicalPattern = + @"((?[^\t]+)\s+" + @"(?" + DOSE + @")" + SPACER + @"(?" + DOSE + @")" + SPACER + @"(?" + DOSE + @")" + @"(" + SPACER + @"(?" + DOSE + @"))?" + - @"( +(?[^\t]+))?)|((?[^\t]+)( +|\t+)(?[^\t]+)))"); + @"( +(?[^\t]+))?)"; + private static readonly Regex canonicalRegex = new Regex(canonicalPattern); + /// + /// The 'alternative' regex matches prescriptions that do not contain regular + /// dosing intervals ("1-0-0"), but free-style comments: "Cotrim forte alle 2 Tage". + /// + /// + /// Because this alternative pattern matches other lines as well (e.g. with + /// signature names), it requires special handling. + /// + private const string alternativePattern = + @"((?[^\t]+)( +|\t+)(?[^\t]+))"; + private static readonly Regex alternativeRegex = new Regex(alternativePattern); + + private static readonly Regex unifiedRegex = new Regex( + "(" + canonicalPattern + "|" + alternativePattern + ")"); + + /// + /// A 'cached', reusable regex to match several whitespace characters. + /// private static readonly Regex spaceRegex = new Regex(@"\s+"); #endregion