# crl/data.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121

import re
import numpy as np

class LoadError(Exception):
	"""Raised when a data file cannot be read as one of the supported formats."""


def identify(contents):
	"""Classify raw file bytes by the program that wrote them.

	contents -- the complete file as a bytes object.

	Returns "spectrasuite" when the data begins with the SpectraSuite
	header, "Plot Digitizer" when the Plot Digitizer credit line occurs
	anywhere in the data, and "unknown" otherwise. The SpectraSuite check
	takes precedence when both would apply.
	"""
	spectrasuite_header = b"SpectraSuite Data File"
	plot_digitizer_credit = b"created by Plot Digitizer"

	if contents.startswith(spectrasuite_header):
		return "spectrasuite"
	if plot_digitizer_credit in contents:
		return "Plot Digitizer"
	return "unknown"

def identify_csv(line):
	"""Guess the column layout of a line of delimited numbers.

	The guess is made from the characters present in the line:

	* if the line contains ".", that is the decimal point, and "," (when
	  also present) is the column delimiter;
	* otherwise "," is taken as the decimal point;
	* unless "," was chosen as the delimiter, columns are separated by
	  runs of whitespace.

	line -- one representative data line (surrounding whitespace is ignored).

	Returns a tuple (num_cols, regex). The compiled regex has three groups
	per column, in order: integer part (with optional sign), fractional
	digits (possibly empty) and exponent (possibly empty).

	Raises ValueError if the line is empty or contains only whitespace.
	"""
	line = line.strip()
	if not line:
		raise ValueError("Cannot detect the CSV layout of an empty line")

	point = ","
	delim = r"\s+"

	if "." in line:
		point = "."
		if "," in line:
			delim = ","

	num_cols = len(re.findall(delim, line)) + 1
	# A trailing "," does not start another column. Trailing whitespace was
	# already stripped, so this can only trigger for the comma delimiter.
	if line[-1] == delim:
		num_cols -= 1

	re_int = r"([\+-]?\d+)"
	# The decimal point must be escaped: a bare "." would match any
	# character, so the "e" of "1e5" would be consumed as the point.
	re_frac = r"%s?(\d*)" % re.escape(point)
	re_exp = r"[eE]?([\+-]?\d*)"

	regex = delim.join([re_int + re_frac + re_exp] * num_cols)

	return num_cols, re.compile(regex)

def parse_csv(lines):
	"""Parse lines of delimited numbers into a 2-D float array.

	The column layout (delimiter, decimal point, column count) is detected
	by identify_csv() from the first non-blank line and applied to every
	other line.

	lines -- sequence of text lines; blank lines are ignored.

	Returns a numpy array of dtype float with one row per non-blank line
	and one column per detected column.

	Raises ValueError if there are no non-blank lines, if a line does not
	match the detected layout, or if a matched field is not a valid number.
	"""
	# Blank lines carry no data. Dropping them before allocating keeps the
	# array free of rows that would be reserved but never written
	# (np.empty does not initialise its contents).
	rows = [text for text in (line.strip() for line in lines) if text]
	if not rows:
		raise ValueError("No data lines to parse")

	num_cols, regex = identify_csv(rows[0])

	data = np.empty((len(rows), num_cols), dtype="float")

	for i, row in enumerate(rows):
		rv = regex.match(row)
		if rv is None:
			raise ValueError("Data line doesn't match the detected layout: %r" % row)

		for j in range(num_cols):
			str_int, str_frac, str_exp = rv.group(3 * j + 1, 3 * j + 2, 3 * j + 3)

			# Rebuild a canonical literal and let float() convert it. This
			# handles a missing fraction or exponent, keeps the sign of
			# values such as "-0.5" (whose integer part is negative zero)
			# and rounds correctly regardless of the decimal point used in
			# the file.
			data[i, j] = float("%s.%se%s" % (str_int, str_frac or "0", str_exp or "0"))

	return data

# Section marker of the form ">>>>>Some Title<<<<<" at the start of a line;
# group 1 captures the text between the arrow runs.
re_spectrasuite_marker = re.compile(r"^>>>>>(.*)<<<<<")

def parse_spectrasuite(lines):
	"""Extract and parse the processed-data section of a SpectraSuite file.

	The section is delimited by the marker lines
	">>>>>Begin Processed Spectral Data<<<<<" and
	">>>>>End Processed Spectral Data<<<<<". The lines between them are
	handed to parse_csv(). If a marker occurs more than once, its last
	occurrence is used.

	lines -- the file's text lines.

	Raises ValueError if either marker is missing.
	"""
	first_row = None
	stop_row = None

	for index, text in enumerate(lines):
		found = re_spectrasuite_marker.match(text)
		if found is None:
			continue

		title = found.group(1)
		if title == "Begin Processed Spectral Data":
			# Data starts on the line after the marker.
			first_row = index + 1
		elif title == "End Processed Spectral Data":
			stop_row = index

	if first_row is None:
		raise ValueError("Missing 'Begin Processed Spectral Data'")
	if stop_row is None:
		raise ValueError("Missing 'End Processed Spectral Data'")

	section = lines[first_row:stop_row]
	return parse_csv(section)

def parse_plot_digitizer(lines):
	"""Parse the data rows of a Plot Digitizer export.

	The first six lines are skipped and everything after them is passed
	to parse_csv().

	lines -- the file's text lines.
	"""
	header_rows = 6
	body = lines[header_rows:]
	return parse_csv(body)

def load(path):
	"""Load a data file and return its numeric contents as an array.

	The file is read as bytes, classified with identify(), decoded as
	ASCII line by line and passed to the parser for the detected format
	(parse_csv() for unrecognised files).

	path -- path of the file to read.

	Returns whatever the selected parser returns.

	Raises LoadError if the file is not pure ASCII or if the parser fails
	for any reason. Errors from opening or reading the file are not
	wrapped and propagate unchanged.
	"""
	with open(path, "rb") as fd:
		contents = fd.read()

	fmt = identify(contents)

	try:
		# bytes.splitlines() accepts "\n", "\r\n" and "\r" line endings and,
		# unlike split(b"\n"), yields no phantom empty line after a final
		# newline and leaves no stray "\r" at the end of each line.
		lines = [raw.decode("ascii") for raw in contents.splitlines()]
	except UnicodeDecodeError:
		raise LoadError("This non-ASCII data format isn't supported")

	if fmt == "spectrasuite":
		parser = parse_spectrasuite
		failure = "This SpectraSuite file couldn't be understood"
	elif fmt == "Plot Digitizer":
		parser = parse_plot_digitizer
		failure = "This Plot Digitizer file couldn't be understood"
	else:
		parser = parse_csv
		failure = "This data format isn't supported"

	try:
		return parser(lines)
	except Exception:
		# Any parser failure is reported uniformly as a LoadError.
		raise LoadError(failure)